<a href="https://colab.research.google.com/github/lov435/SOEmotions/blob/main/Subtype_Prediction_from_GoEmotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Import the required libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

#### Get the dataset CSV from Google Drive

In [None]:
# Shareable Google Drive link to the dataset CSV.
url = 'https://drive.google.com/file/d/1OW1PZ-MvXFGd4KbqE8zjSakPNVKXXVLy/view?usp=sharing'
# The file id is the second-to-last path segment of the share link.
file_id = url.rsplit('/', 2)[1]
# Build the direct-download form of the link so pandas can read it as a CSV.
dwn_url = f'https://drive.google.com/uc?id={file_id}'
df = pd.read_csv(dwn_url)

#### Extract the features and classes from the dataframe

In [None]:
# Names of the 28 emotion-score columns used as model features.
X_cols = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

# Feature matrix: one column per emotion listed above.
X = df[X_cols]
# Target: the text label in the 'Label' column.
y = df['Label']

#### Use the pandas factorize function to encode the Label column of the dataset as integers. This returns both the integer codes and the definitions (the original label for each code).

In [None]:
# Encode the string labels as integer codes. The codes are always derived from
# the untouched df['Label'] column (not from `y`, which this cell overwrites),
# so re-running the cell cannot factorize already-encoded integers and turn
# `definitions` into a list of numbers.
factor = pd.factorize(df['Label'])
# `factor` is a (codes, uniques) pair: `y` holds one integer code per row and
# `definitions` holds the label that each code stands for.
y, definitions = factor
print(y)
print(definitions)

[0 1 2 ... 3 1 7]
Index(['Solution', 'Clarification', 'Correction', 'Question', 'Performance',
       'Support', 'Request', 'Irrelevant', 'Praise', 'Reference', 'Disagree',
       'Flaw', 'Example', 'Highlight', 'Error', 'Extension', 'Obsolete'],
      dtype='object')


#### Creating the Training and Test set from data

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=21,
)

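#### Feature Scaling. We will use the standard scaler provided in the sklearn library. For each feature, it subtracts the mean and then divides by the standard deviation, so every feature ends up with zero mean and unit variance. The scaler is fitted on the training set only and then applied to the test set.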

In [None]:
# Learn the scaling statistics from the training split only, so nothing about
# the test split leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Fitting Random Forest Classification to the Training set


In [None]:
# Random forest of 10 trees that split on information gain (entropy); the
# fixed seed makes the fitted forest reproducible.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=42,
)
# Kept as the last expression so the fitted estimator is displayed as the cell output.
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=42)

#### Predicting the Test set results

In [None]:
# Predict one integer class code for every row of the scaled test set.
y_pred = classifier.predict(X=X_test)

#### Reverse factorize the 'Label' column from numbers to text.

In [None]:
# Lookup table from integer code back to the original text label.
reversefactor = dict(zip(range(definitions.size), definitions))
decode = np.vectorize(reversefactor.get)

# This cell overwrites y_test / y_pred in place. Decode only while they still
# hold integer codes: on a second run they already contain label strings, and
# looking those up in `reversefactor` would silently replace every entry with
# None and corrupt the metrics computed afterwards.
if np.issubdtype(np.asarray(y_test).dtype, np.integer):
    y_test = decode(y_test)
if np.issubdtype(np.asarray(y_pred).dtype, np.integer):
    y_pred = decode(y_pred)

#### Create a confusion matrix

In [None]:
# Cross-tabulate actual vs. predicted labels. crosstab only emits rows/columns
# for labels that actually occur on that axis, so a class that is never
# predicted has no column and the matrix is not square. Reindexing both axes
# over the full label set (filling the gaps with 0) keeps every class on the
# diagonal.
all_labels = sorted(definitions)
confusion = (
    pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    .reindex(index=all_labels, columns=all_labels, fill_value=0)
    .rename_axis(index='Actual', columns='Predicted')
)
# Bare last expression: the notebook renders the table instead of the
# line-wrapped plain-text form produced by print().
confusion


Predicted      Clarification  Correction  Disagree  Error  Example  Flaw  \
Actual                                                                     
Clarification             87           6         2      3        1     7   
Correction                20           0         0      0        1     6   
Disagree                   9           1         6      2        0     3   
Error                     19           2         1      1        0     3   
Example                    9           0         0      1        0     1   
Extension                  3           0         0      0        0     0   
Flaw                      24           0         1      1        0     5   
Highlight                  3           0         0      0        0     0   
Irrelevant                35           3         2      1        0     2   
Obsolete                   5           0         0      0        0     1   
Performance                3           0         0      0        0     1   
Praise      

#### Print accuracy and f1 score

In [None]:
# Fraction of test rows whose predicted label matches the actual label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.37089715536105033

In [None]:
# Per-class F1 scores. Passing an explicit label order and wrapping the result
# in a Series ties every score to its class name; the bare array returned by
# f1_score gives no indication of which class each number belongs to.
class_labels = sorted(definitions)
pd.Series(
    f1_score(y_test, y_pred, labels=class_labels, average=None),
    index=class_labels,
    name='f1',
)

array([0.36786469, 0.        , 0.25531915, 0.03846154, 0.        ,
       0.        , 0.11494253, 0.        , 0.23584906, 0.        ,
       0.11764706, 0.66440678, 0.69964664, 0.29166667, 0.08      ,
       0.09937888, 0.        ])