# AI Blitz 9 - Sentiment Detection

In [4]:
import os

# data analysis
import numpy as np
import pandas as pd

# text analysis
import spacy

# scikit learn
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score

import pickle

from app.tools import create_data

In [2]:
!python -m spacy download en_core_web_sm # Downloaing the model for engligh language will contains many pretrained preprocessing pipelines

/home/neperiana/.cache/pypoetry/virtualenvs/sentimentor-Hdw3AJT1-py3.8/bin/python: No module named spacy


## Read data

In [5]:
df_train = pd.read_csv('data/train.csv')
print(df_train.shape)
df_train.head()

(31255, 2)


Unnamed: 0,text,label
0,takes no time to copy/paste a press release,0
1,You're delusional,1
2,Jazz fan here. I completely feel. Lindsay Mann...,0
3,ah i was also confused but i think they mean f...,0
4,Thank you so much. ♥️ that means a lot.,0


In [6]:
df_val = pd.read_csv('data/val.csv')
print(df_val.shape)
df_val.head()

(3473, 2)


Unnamed: 0,text,label
0,While I agree with my political views could be...,0
1,im still starving,1
2,*Hey just noticed..* it's your **2nd Cakeday**...,0
3,They just did. Check out the sticky post.,0
4,"I hope so too, she deserves it.",0


In [7]:
df_test = pd.read_csv('data/test.csv')
print(df_test.shape)
df_test.head()

(8682, 2)


Unnamed: 0,text,label
0,I was already over the edge with Cassie Zamora...,0
1,I think you're right. She has oodles of cash a...,0
2,Haha I love this. I used to give mine phone bo...,1
3,Probably out of desperation as they going no a...,0
4,Sorry !! You’re real good at that!!,0


## Data processing using word2vec

In [None]:
nlp = spacy.load('en_core_web_sm')

In [9]:
# Getting a sample text from training dataset to demonstrate word2vec  
sample_text = df_train.iloc[3]['text'] 
sample_text

'ah i was also confused but i think they mean friends around the same age'

In [10]:
# Inputting the text in nlp function
doc = nlp(sample_text)

# Getting the embeddings from the sample text
doc.vector

array([-0.15878154, -0.03677491, -0.26570484,  0.153825  ,  0.5224229 ,
        0.2477987 ,  0.22043146, -0.08919676,  0.15389247, -0.13799287,
        0.03572906, -0.14947385, -0.08140312, -0.4520209 ,  0.1583632 ,
       -0.33586153,  0.25967005,  0.04424061,  0.31546462,  0.17561674,
       -0.09352725, -0.33157822, -0.21478768,  0.3185982 , -0.31043687,
       -0.35127166,  0.13590854,  0.01204349,  0.11739865, -0.13809504,
       -0.01827029, -0.5766309 , -0.1536432 ,  0.10616783, -0.07159047,
       -0.69067687, -0.22970244, -0.06416092,  0.2803262 , -0.15795292,
       -0.48734984, -0.34739298,  0.22527488, -0.01436723,  0.14623967,
       -0.06568522,  0.23915103, -0.2348073 ,  0.09553125,  0.01283267,
        0.30198038,  0.7464748 ,  0.02582642, -0.40294746, -0.34426636,
        0.33271903, -0.23377565, -0.11946503,  0.15855373, -0.15903643,
       -0.07973281, -0.17950998,  0.20564462,  0.02248472,  0.38529328,
       -0.25225443,  0.3150023 , -0.43472373,  0.03292338, -0.22

In [12]:
# Creating the training dataset
X_train, y_train = create_data(df_train)

# Creating the validation dataset
X_val, y_val = create_data(df_val)

## Training model

In [34]:
params = {
    'alpha': 1.332478736676892,
    'colsample_bytree': 0.9517609933272821,
    'eta': 0.14374409112885878,
    'eval_metric': 'auc',
    'max_depth': 10,
    'min_child_weight': 9.72576189153768,
    'n_estimators': 48,
    'scale_pos_weight': 3.7812452195196573,
}

In [35]:
clf = XGBClassifier(**params, objective='binary:logistic')

In [36]:
clf = clf.fit(X_train, y_train)

## Validation

In [37]:
pred_val = clf.predict(X_val)

In [38]:
# Getting F1 & Accuracy score of validation predictions
f1 = f1_score(y_val, pred_val)
accuracy = accuracy_score(y_val, pred_val)

print(f"Validation F1 Score  : {f1} and Accuracy Score {accuracy}")

Validation F1 Score  : 0.27813620071684586 and Accuracy Score 0.710048949035416


## Saving model locally

In [47]:
filename = 'sentiment_cgb.pkl'
pickle.dump(clf, open(filename, 'wb'))

## Submitting results

In [39]:
# By settings is_train=False, the create_data function will only output the features as setuped in the function
X_test = create_data(df_test, is_train=False)
pred_test = clf.predict(X_test)

In [40]:
# Applying the predictions to the labels column of the sample submission 
df_test['label'] = pred_test
df_test.head()

Unnamed: 0,text,label
0,I was already over the edge with Cassie Zamora...,0
1,I think you're right. She has oodles of cash a...,1
2,Haha I love this. I used to give mine phone bo...,0
3,Probably out of desperation as they going no a...,0
4,Sorry !! You’re real good at that!!,0


In [41]:
# loging in to aicrowd
API_KEY = os.environ.get('AICROWD_API_KEY') # Please get your your API Key from [https://www.aicrowd.com/participants/me]
!aicrowd login --api-key $API_KEY

[32mAPI Key valid[0m
[32mSaved API Key successfully![0m


In [44]:
!mkdir assets

# Saving the sample submission in assets directory
df_test.to_csv(os.path.join("assets", "submission.csv"), index=False)

In [45]:
!aicrowd notebook submit -c emotion-detection -a assets --no-verify

 %load_ext aicrowd.magic
%aicrowd notebook submit -c emotion-detection -a assets --no-verify
Using notebook: sentiment_analysis.ipynb for submission...
Scrubbing API keys from the notebook...
Collecting notebook...
[2K[1;34msubmission.zip[0m [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━[0m [35m100.0%[0m • [32m291.2/289.5 KB[0m • [31m2.1 MB/s[0m • [36m0:00:00[0m[0m • [36m0:00:01[0m[36m0:00:01[0m
[?25h                                                  ╭─────────────────────────╮                                                  
                                                  │ [1mSuccessfully submitted![0m │                                                  
                                                  ╰─────────────────────────╯                                                  
[3m                                                        Important links                                                        [0m
┌──────────────────┬────────────────────────────────────────