# Calculating 13 Rubine Features for sketches

In [1]:
import pandas as pd
import numpy as np
from math import *
import os

### Rf01(cosine) and Rf02(sine) of Starting Angle

<b> Starting angle is defined as the angle between the first and the third points of a sketch. </b><br>
Let, ($x_{0}$, $y_{0}$) be the starting point and ($x_{2}$, $y_{2}$) be  the third point of a sketch <br>
Then,<br>
&#8710;$y$ = $y_{2}-y_{0}$ <br>
&#8710;$x$ = $x_{2}-x_{0}$ <br>
and hypotenuse ($hyp$) = &#8730;((&#8710;$y$)&#178; + (&#8710;$x$)&#178;) <br>

Let, &#920; be the starting angle <br>
So, Rf01 = $cos$&#920; = &#8710;$x$/$hyp$<br>
and Rf02 = $sin$&#920; = &#8710;$y$/$hyp$<br>

<b><i>Precaution:</i></b><br>
To avoid a divide-by-zero error when &#8710;$y$ and &#8710;$x$ are both 0 (i.e. the sketcher traces the third point of the sketch back onto the first point, or the first 3 points all coincide), `np.divide()` is used: it returns `nan` for a division by zero instead of throwing an error.

In [2]:
def calc_f01_f02(data):
    """
    This function calculates Rf01 and Rf02
    Rf01 is the cosine of the starting angle and Rf02 is the sine of the starting angle
    Starting angle is the angle of the line from the first point (x0, y0) to the third point (x2, y2)

    Columns are read by position: column 0 is used as x and column 1 as y.

    Degenerate sketches give (nan, nan) instead of raising an error:
    - the sketch has fewer than 3 points, so there is no third point
    - the first and third points coincide, so the hypotenuse is 0 (0/0)

    :param data: Pandas Dataframe containing all the sketch details of x,y and t
    :type data: Pandas Dataframe
    :return: Rf01 and Rf02
    :rtype: Double rounded to 12 digits after decimal places (nan for degenerate sketches)
    """
    if data.shape[0] < 3:
        # No third point exists; indexing row 2 would raise IndexError
        return np.nan, np.nan

    del_init_x = data.iloc[2, 0] - data.iloc[0, 0] #delta_x between the 1st and 3rd point
    del_init_y = data.iloc[2, 1] - data.iloc[0, 1] #delta_y between the 1st and 3rd point
    init_hyp = sqrt(np.square(del_init_y) + np.square(del_init_x)) #hypotenuse between the 1st and 3rd point

    # 0/0 is an expected case here: np.divide yields nan, and errstate keeps
    # numpy from emitting a RuntimeWarning for every degenerate sketch.
    with np.errstate(divide='ignore', invalid='ignore'):
        f01 = np.divide(del_init_x, init_hyp)
        f02 = np.divide(del_init_y, init_hyp)
    return round(f01, 12), round(f02, 12)

### Rf03(Bounding Box Diagonal Length) and Rf04(Bounding Box Angle)

<b> Bounding box is defined as the smallest rectangle with vertical and horizontal sides that completely surrounds an image or an object.</b><br>
Let, ($x_{min}$, $y_{max}$) and ($x_{max}$, $y_{min}$) be the endpoints of the diagonal of the bounding box.<br>
Then,<br>
&#8710;$y$ = $y_{max}-y_{min}$ <br>
&#8710;$x$ = $x_{max}-x_{min}$ <br>
So, Rf03 = Bounding Box diagonal = &#8730;((&#8710;$y$)&#178; + (&#8710;$x$)&#178;) <br>

Bounding box angle is the angle between the bounding box diagonal and the x-axis.<br>
Let, &#920; be the bounding box angle<br>
So, Rf04 = &#920; = $arctan$(&#8710;$y$/&#8710;$x$)

<b><i>Precaution:</i></b><br>
Finding the angle involves taking the `arctan` of the slope of the bounding box diagonal with respect to the x-axis. However, `arctan` only returns angles in the 1st and 4th quadrants, i.e. (-&#960;/2, &#960;/2), and it fails when &#8710;$x$ is 0. So, to make the angle defined in all the quadrants, i.e. (-&#960;, &#960;], we have to use `atan2`.

In [3]:
def calc_f03_f04(data):
    """
    Compute Rf03 and Rf04 from the axis-aligned bounding box of a sketch.

    Rf03 is the length of the bounding box diagonal, i.e. the distance between
    (xmin, ymax) and (xmax, ymin).
    Rf04 is the angle between that diagonal and the x-axis, obtained with atan2
    from the box height and width.

    :param data: Pandas Dataframe of the sketch; the 'x' and 'y' columns are read
    :type data: Pandas Dataframe
    :return: Rf03 and Rf04
    :rtype: Double rounded to 12 digits after decimal places
    """
    xs = data['x']
    ys = data['y']

    # Side lengths of the bounding box
    box_width = xs.max() - xs.min()
    box_height = ys.max() - ys.min()

    diagonal_length = sqrt(np.square(box_width) + np.square(box_height))
    diagonal_angle = atan2(box_height, box_width) #atan2 stays defined when the width is 0

    return round(diagonal_length, 12), round(diagonal_angle, 12)

### Rf05(Distance between Endpoints), Rf06(Cosine of angle between endpoints), Rf07(sine of angle between endpoints)

Let, ($x_{0}$, $y_{0}$) be the starting point and ($x_{n-1}$, $y_{n-1}$) be  the end point of a sketch <br>
Then,<br>
&#8710;$y$ = $y_{n-1}-y_{0}$ <br>
&#8710;$x$ = $x_{n-1}-x_{0}$ <br>
So, Rf05 = &#8730;((&#8710;$y$)&#178; + (&#8710;$x$)&#178;) <br>

Let, &#920; be the angle between the endpoints<br>
So, Rf06 = $cos$&#920; = &#8710;$x$/Rf05<br>
and Rf07 = $sin$&#920; = &#8710;$y$/Rf05<br>

<b><i>Precaution:</i></b><br>
To avoid any situation such as divide by zero, in the case that &#8710;y and &#8710;x both are 0, i.e. the sketcher traces back the last point of the sketch to the first point or all the points are at the same point, `np.divide()` is used which returns `nan` when we get a case of divide by zero instead of throwing an error.

In [4]:
def calc_f05_f06_f07(data):
    """
    Compute Rf05, Rf06 and Rf07 from the first and last point of a sketch.

    Rf05 is the distance between the endpoints (x0, y0) and (xn-1, yn-1).
    Rf06 and Rf07 are the cosine and sine of the angle of the line joining them.
    Columns are read by position: column 0 is used as x and column 1 as y.

    When both endpoints coincide the distance is 0 and np.divide produces nan
    for Rf06 and Rf07 rather than raising an error.

    :param data: Pandas Dataframe containing all the sketch details of x,y and t
    :type data: Pandas Dataframe
    :return: Rf05, Rf06, Rf07
    :rtype: Double rounded to 12 digits after decimal places
    """
    first_x, first_y = data.iloc[0, 0], data.iloc[0, 1]
    last_x, last_y = data.iloc[-1, 0], data.iloc[-1, 1]

    # Displacement from the first point to the last point
    span_x = last_x - first_x
    span_y = last_y - first_y

    endpoint_distance = sqrt(np.square(span_x) + np.square(span_y))
    cos_endpoints = np.divide(span_x, endpoint_distance)
    sin_endpoints = np.divide(span_y, endpoint_distance)

    return (
        round(endpoint_distance, 12),
        round(cos_endpoints, 12),
        round(sin_endpoints, 12),
    )

### Rf08(Stroke Length) - Sum of all stroke lengths

<b>Stroke length is defined as the distance of a single stroke. Every sketch contains multiple strokes. The sum of all the strokes in a sketch is the Total Stroke Length</b><br>

Let's say, the points of a sketch are ($x_{0}$, $y_{0}$), ($x_{1}$, $y_{1}$), ... , ($x_{n-1}$, $y_{n-1}$)<br>
Stroke length is the distance between any adjacent points.

&#8710;$x_{i}$ = $x_{i} - x_{i-1}$<br>
&#8710;$y_{i}$ = $y_{i} - y_{i-1}$<br>

Stroke Length ($s_{i}$) = &#8730;((&#8710;$y_{i}$)&#178; + (&#8710;$x_{i}$)&#178;) <br>
So, Sum of all Stroke Length = $\sum \limits _{i=1} ^{n-1} s_{i}$

In [5]:
def calc_f08(data):
    """
    Compute Rf08, the total stroke length of a sketch.

    Every pair of adjacent points contributes the straight-line distance between
    them; the contributions are accumulated in point order. Columns are read by
    position: column 0 is used as x and column 1 as y. A sketch with fewer than
    two points has no segment and gives 0.

    :param data: Pandas Dataframe containing all the sketch details of x,y and t
    :type data: Pandas Dataframe
    :return: Rf08
    :rtype: Double rounded to 12 digits after decimal places
    """
    n_points = data.shape[0]
    path_length = 0

    # Pair every point with its predecessor: (0, 1), (1, 2), ..., (n-2, n-1)
    for prev_idx, curr_idx in zip(range(n_points - 1), range(1, n_points)):
        step_x = data.iloc[curr_idx, 0] - data.iloc[prev_idx, 0]
        step_y = data.iloc[curr_idx, 1] - data.iloc[prev_idx, 1]
        segment = sqrt(np.square(step_x) + np.square(step_y))
        path_length = path_length + segment

    return round(path_length, 12)

### Rf09(Total relative rotation), Rf10(Total absolute rotation), Rf11(Total squared rotation)

Let's say, the points of a sketch are ($x_{0}$, $y_{0}$), ($x_{1}$, $y_{1}$), ... , ($x_{n-1}$, $y_{n-1}$)<br>

<b>Total Relative Rotation is the sum of all angles between each stroke as respect to its previous stroke. It is also understood as the total angle traversed.</b><br>
Let, &#920;$_{i}$ be the angle between the $i^{th}$ and the $(i-1)^{th}$ stroke<br>
Let, the $i^{th}$ stroke be from ($x_{i-1}$, $y_{i-1}$) to ($x_{i}$, $y_{i}$) <br>
and the $(i-1)^{th}$ stroke be from ($x_{i-2}$, $y_{i-2}$) to ($x_{i-1}$, $y_{i-1}$) <br>
Then, <br>
&#8710;$y_{i}$ = $y_{i}-y_{i-1}$ <br>
&#8710;$x_{i}$ = $x_{i}-x_{i-1}$ <br>

So, &#920;$_{i}$ = $arctan$((&#8710;$x_{i}$ * &#8710;$y_{i-1}$ - &#8710;$y_{i}$ * &#8710;$x_{i-1}$)/(&#8710;$x_{i}$ * &#8710;$x_{i-1}$ + &#8710;$y_{i}$ * &#8710;$y_{i-1}$))<br>

Then, Rf09 = $\sum \limits _{i=2} ^{n-1}$&#920;$_{i}$

<b>Total Absolute Rotation is the sum of all absolute angles between strokes. It can also be understood as the total movement done during a sketch.</b><br>
Similarly, Rf10 = $\sum \limits _{i=2} ^{n-1}\lvert$&#920;$_{i}\rvert$<br>

<b>Total Squared Rotation is the sum of square of all absolute angles between strokes. It can also be understood as the sharpness of a sketch. It amplifies the sudden change in angle as it squares the angle.</b><br>
So if a shape has sharp corners, its Total Squared Rotation will be higher than that of a shape with smooth corners.<br>
Hence similarly, Rf11 = $\sum \limits _{i=2} ^{n-1}\lvert$&#920;$_{i}\rvert^{2}$<br>

<b><i>Precaution:</i></b><br>
Finding &#920;$_{i}$ involves taking the `arctan` of the ratio above, which describes the angle between two consecutive strokes. However, `arctan` only returns angles in the 1st and 4th quadrants, i.e. (-&#960;/2, &#960;/2), and it fails when the denominator is 0. So, to make the angle defined in all the quadrants, i.e. (-&#960;, &#960;], we have to use `atan2`.

In [6]:
def calc_f09_f10_f11(data):
    """
    Compute Rf09, Rf10 and Rf11, the rotation features of a sketch.

    Rf09 is the total relative rotation (signed sum of the turning angles)
    Rf10 is the total absolute rotation (sum of the absolute turning angles)
    Rf11 is the total squared rotation (sum of the squared turning angles)

    A turning angle needs two consecutive strokes, i.e. three consecutive points,
    so the walk starts at the 3rd point. A sketch with fewer than 3 points has no
    turning angle and all three features are 0. Columns are read by position:
    column 0 is used as x and column 1 as y.

    :param data: Pandas Dataframe containing all the sketch details of x,y and t
    :type data: Pandas Dataframe
    :return: Rf09, Rf10, and Rf11
    :rtype: Double rounded to 12 digits after decimal places
    """
    n_points = data.shape[0]
    relative_rotation = 0
    absolute_rotation = 0
    squared_rotation = 0

    for end in range(2, n_points):
        mid, start = end - 1, end - 2

        # Current stroke: mid -> end; previous stroke: start -> mid
        curr_dx = data.iloc[end, 0] - data.iloc[mid, 0]
        curr_dy = data.iloc[end, 1] - data.iloc[mid, 1]
        prev_dx = data.iloc[mid, 0] - data.iloc[start, 0]
        prev_dy = data.iloc[mid, 1] - data.iloc[start, 1]

        # Cross-like and dot-like terms of the two stroke vectors; the sign
        # convention of the first term is the one chosen for screen coordinates
        cross_term = prev_dy*curr_dx - curr_dy*prev_dx
        dot_term = curr_dx*prev_dx + curr_dy*prev_dy
        turn = atan2(cross_term, dot_term)

        relative_rotation += turn
        absolute_rotation += abs(turn)
        squared_rotation += np.square(turn)

    return (
        round(relative_rotation, 12),
        round(absolute_rotation, 12),
        round(squared_rotation, 12),
    )


### Rf12(Maximum speed squared)

<b>Speed is the distance travelled in a defined time interval.</b><br>
Let's say, the points of a sketch are ($x_{0}$, $y_{0}$), ($x_{1}$, $y_{1}$), ... , ($x_{n-1}$, $y_{n-1}$) taken at $t_{0}$, $t_{1}$, ... , $t_{n-1}$ timestamps respectively<br>
Then let,<br>
&#8710;$y_{i}$ = $y_{i}-y_{i-1}$ <br>
&#8710;$x_{i}$ = $x_{i}-x_{i-1}$ <br>
&#8710;$t_{i}$ = $t_{i}-t_{i-1}$ <br>

Then, the speed between ($x_{i-1}$, $y_{i-1}$) and ($x_{i}$, $y_{i}$) is given by<br>
$v_{i}$ = &#8730;((&#8710;$y_{i}$)&#178; + (&#8710;$x_{i}$)&#178;)/&#8710;$t_{i}$

So, Rf12 = $\max \limits _{i=1} ^{n-1} v_{i}^{2}$

<b><i>Precaution:</i></b><br>
The calculation of speed has the time difference in the denominator. With a very high sampling rate, two consecutive points can carry the same timestamp, which would make the denominator 0. In such a case, we ignore that point and move ahead with the next point.

In [7]:
def calc_f12(data):
    """
    This function calculates Rf12
    Rf12 is the Maximum of the speed squared from all the strokes

    The calculation starts from the 2nd point and moves to the last point finding the max of the squared speed.
    However, the calculation of speed involves the difference in time in the denominator. With very high sampling rate,
    we can face that two points could be on the same time stamp. For such case (and for any non-increasing time stamp),
    we report it, ignore the point and move ahead with the next point.

    Columns are read by position: column 0 is used as x, column 1 as y and column 2 as t.

    If no pair of adjacent points has a positive time difference (a single-point sketch, or every
    time stamp identical) there is no speed to take the maximum of, and nan is returned.

    :param data: Pandas Dataframe containing all the sketch details of x,y and t
    :type data: Pandas Dataframe
    :return: Rf12
    :rtype: Double rounded to 12 digits after decimal places (nan when no speed can be computed)
    """
    rows, columns = data.shape
    speed_sq = []

    for i in range(1, rows):

        del_t = data.iloc[i, 2] - data.iloc[i-1, 2]
        if del_t<=0: #If there is no (or a negative) time interval between two points
            print("Sampling Rate error")
            continue #We ignore the point and move onto the next point

        del_x = data.iloc[i, 0] - data.iloc[i-1, 0]
        del_y = data.iloc[i, 1] - data.iloc[i-1, 1]
        speed_sq.append(np.divide((np.square(del_x) + np.square(del_y)), np.square(del_t)))

    if not speed_sq:
        # max() of an empty list raises ValueError; nan marks the feature as undefined
        return np.nan

    f12 = max(speed_sq)
    return round(f12, 12)


### Rf13(Total time taken)

<b>Total time taken is simply the total time taken to draw the sketch.</b><br>
Let, the points of a sketch are ($x_{0}$, $y_{0}$), ($x_{1}$, $y_{1}$), ... , ($x_{n-1}$, $y_{n-1}$) taken at $t_{0}$, $t_{1}$, ... , $t_{n-1}$ timestamps respectively<br>

So, Rf13 = $t_{n-1}$ - $t_{0}$

In [8]:
def calc_f13(data):
    """
    Compute Rf13, the total time taken to draw the sketch.

    It is the value of the last row minus the value of the first row in the
    third column (position 2), which is used as the timestamp t.

    :param data: Pandas Dataframe containing all the sketch details of x,y and t
    :type data: Pandas Dataframe
    :return: Rf13
    :rtype: Double rounded to 12 digits after decimal places
    """
    first_stamp = data.iloc[0, 2]
    last_stamp = data.iloc[-1, 2]
    elapsed = last_stamp - first_stamp
    return round(elapsed, 12)

In [9]:
def calc_rubine_features(sketch_name, data, rubine_features_dict):
    """
    Helper Function
    ---------------
    This function calls all the individual functions to calculate the rubine features.
    This writes them into a dictionary, rubine_features_dict - which is our final output. This contains the features of all
    sketches in our dataset.

    All 13 features are computed before anything is appended. If one of the calc_* functions raises, the
    dictionary is left untouched, so its lists never end up with unequal lengths (which would make the
    dictionary unusable as the columns of a dataframe).

    :param sketch_name: Name of the sketch file
    :type sketch_name: string
    :param data: Pandas dataframe related to the sketch
    :type data: Pandas Dataframe
    :param rubine_features_dict: Output dictionary which contains all the features for all the sketch;
        it must already hold a list under 'sketch' and under each of 'f01' ... 'f13'. It is modified in place.
    :type rubine_features_dict: Python dict
    :return: Output dictionary which contains all the features for all the sketch (the same object that was passed in)
    :rtype: Python dict
    """

    # Step 1: compute everything; nothing is written to the output yet
    f01, f02 = calc_f01_f02(data)
    f03, f04 = calc_f03_f04(data)
    f05, f06, f07 = calc_f05_f06_f07(data)
    f08 = calc_f08(data)
    f09, f10, f11 = calc_f09_f10_f11(data)
    f12 = calc_f12(data)
    f13 = calc_f13(data)

    # Step 2: append one value to every column so all lists grow together
    sketch_row = {
        'sketch': sketch_name,
        'f01': f01, 'f02': f02, 'f03': f03, 'f04': f04, 'f05': f05, 'f06': f06,
        'f07': f07, 'f08': f08, 'f09': f09, 'f10': f10, 'f11': f11, 'f12': f12, 'f13': f13,
    }
    for column, value in sketch_row.items():
        rubine_features_dict[column].append(value)

    return rubine_features_dict





In [10]:
def rubine_features(data_folder):
    """
    Helper function
    ---------------
    This function reads individual sketch from the folder path.
    It also creates a dict which will be used to create a pandas dataframe and eventually to features.csv
    The dict is maintained in order to store all the features for every sketch.

    The folder is expected to contain one sub-folder per letter, each holding one CSV file per sketch.
    Entries of data_folder that are not directories are skipped. Folders and files are visited in sorted
    order so that the order of the output rows does not depend on the file system.
    The sketch name is the file name up to its first '.'.

    :param data_folder: Path to the data folder
    :type data_folder: string
    :return: Output dictionary which contains all the features for all the sketch
    :rtype: Python dict
    """

    rubine_features_dict = {
            'sketch':[],
            'f01':[], 'f02': [], 'f03': [], 'f04':[], 'f05':[], 'f06':[],
            'f07':[], 'f08': [], 'f09': [], 'f10':[], 'f11':[], 'f12':[], 'f13':[]
    }

    # The data_folder argument is used as given; it is no longer overwritten by a hard-coded path
    for letter_folder in sorted(os.listdir(data_folder)):
        letter_path = os.path.join(data_folder, letter_folder)
        if not os.path.isdir(letter_path):
            continue #A stray file next to the letter folders cannot be listed as a folder

        for letter_sketch in sorted(os.listdir(letter_path)):
            data = pd.read_csv(os.path.join(letter_path, letter_sketch))
            sketch_name = letter_sketch.split('.')[0]
            print(sketch_name)
            rubine_features_dict = calc_rubine_features(sketch_name, data, rubine_features_dict)

    return rubine_features_dict


#### Creates the output features.csv of the rubine features from all the sketches 

In [11]:
data_folder = "./data/letters-csv/" #Root folder of the sketches, passed to rubine_features
all_rubine_features = rubine_features(data_folder)
data_results = pd.DataFrame.from_dict(all_rubine_features) #Creates the dataframe from the dict
data_results.to_csv('features.csv', index = False) #Writes the dataframe to features.csv
data_results.head(8) #Kept as the last expression of the cell so that the notebook actually renders the preview

a_1
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
a_10
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
a_11
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
a_12
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
a_13
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
a_14
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
a_15
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
Sampling Rate error
a_16
Sampling Rate error
S