# AI Blitz 9 - Sentiment Detection

In [1]:
import os

# data analysis
import numpy as np
import pandas as pd

# read from S3
import boto3
import io

# text analysis
import spacy

# scikit learn
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score

2022-01-07 16:07:34.505571: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-07 16:07:34.505640: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
!python -m spacy download en_core_web_sm # Download spaCy's small English pipeline, which bundles pretrained preprocessing components

2021-12-29 11:05:22.066054: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-29 11:05:22.066083: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Read data from S3

In [2]:
# S3 client used by every data-loading cell below. Both credentials are looked
# up in the environment so that no secret is written into the notebook.
s3 = boto3.client(
    service_name='s3',
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
)

In [3]:
bucket = 'filedropforprojects'
file_path = 'aicrowdnlp/sentiment/'


def read_from_S3(file_name, file_path=file_path, bucket=bucket):
    """Fetch one CSV object from S3 and parse it into a DataFrame.

    The object key is `file_path` immediately followed by `file_name`.
    The defaults for `file_path` and `bucket` are the module-level values
    as they were when this function was defined.
    """
    key = file_path + file_name
    response = s3.get_object(Bucket=bucket, Key=key)

    # Read the whole body into memory and hand pandas a seekable buffer
    raw_bytes = response['Body'].read()
    return pd.read_csv(io.BytesIO(raw_bytes))

In [5]:
# Training split: print the shape as a quick sanity check, then preview the first rows
df_train = read_from_S3('train.csv')
print(df_train.shape)
df_train.head()

(31255, 2)


Unnamed: 0,text,label
0,takes no time to copy/paste a press release,0
1,You're delusional,1
2,Jazz fan here. I completely feel. Lindsay Mann...,0
3,ah i was also confused but i think they mean f...,0
4,Thank you so much. ♥️ that means a lot.,0


In [6]:
# Validation split, loaded and previewed the same way as the training split
df_val = read_from_S3('val.csv')
print(df_val.shape)
df_val.head()

(3473, 2)


Unnamed: 0,text,label
0,While I agree with my political views could be...,0
1,im still starving,1
2,*Hey just noticed..* it's your **2nd Cakeday**...,0
3,They just did. Check out the sticky post.,0
4,"I hope so too, she deserves it.",0


In [7]:
# Test split; it also carries a `label` column, which is overwritten with
# the model's predictions in the submission section
df_test = read_from_S3('test.csv')
print(df_test.shape)
df_test.head()

(8682, 2)


Unnamed: 0,text,label
0,I was already over the edge with Cassie Zamora...,0
1,I think you're right. She has oodles of cash a...,0
2,Haha I love this. I used to give mine phone bo...,1
3,Probably out of desperation as they going no a...,0
4,Sorry !! You’re real good at that!!,0


## Data processing using spaCy document vectors

In [8]:
# Load the small English pipeline; the `.vector` of each processed text is used as its feature vector below
nlp = spacy.load('en_core_web_sm')

In [9]:
# Pick one training example to show what the pipeline's document vector looks like
sample_text = df_train['text'].iloc[3]
sample_text

'ah i was also confused but i think they mean friends around the same age'

In [10]:
# Run the sample text through the spaCy pipeline to obtain a processed Doc
doc = nlp(sample_text)

# The Doc's vector: one numeric array representing the whole text (shown below)
doc.vector

array([-0.15878154, -0.03677491, -0.26570484,  0.153825  ,  0.5224229 ,
        0.2477987 ,  0.22043146, -0.08919676,  0.15389247, -0.13799287,
        0.03572906, -0.14947385, -0.08140312, -0.4520209 ,  0.1583632 ,
       -0.33586153,  0.25967005,  0.04424061,  0.31546462,  0.17561674,
       -0.09352725, -0.33157822, -0.21478768,  0.3185982 , -0.31043687,
       -0.35127166,  0.13590854,  0.01204349,  0.11739865, -0.13809504,
       -0.01827029, -0.5766309 , -0.1536432 ,  0.10616783, -0.07159047,
       -0.69067687, -0.22970244, -0.06416092,  0.2803262 , -0.15795292,
       -0.48734984, -0.34739298,  0.22527488, -0.01436723,  0.14623967,
       -0.06568522,  0.23915103, -0.2348073 ,  0.09553125,  0.01283267,
        0.30198038,  0.7464748 ,  0.02582642, -0.40294746, -0.34426636,
        0.33271903, -0.23377565, -0.11946503,  0.15855373, -0.15903643,
       -0.07973281, -0.17950998,  0.20564462,  0.02248472,  0.38529328,
       -0.25225443,  0.3150023 , -0.43472373,  0.03292338, -0.22

In [12]:
def create_data(dataset, is_train=True):
    """Turn a frame of raw texts into per-text feature vectors (and labels).

    Args:
        dataset: DataFrame with a 'text' column and, when `is_train` is
            true, a 'label' column.
        is_train: When true (the default) the labels are returned as well.
            Despite the name this applies to any labelled split, e.g. the
            validation set as well as the training set.

    Returns:
        `(X, y)` when `is_train` is true, otherwise just `X`. `X` is a list
        holding the `.vector` of each processed text, in row order; `y` is
        the 'label' column as a plain Python list.
    """
    # All texts as a plain Python list, in row order
    texts = list(dataset['text'].values)

    # Consume the pipeline lazily: only the vector of each processed text is
    # kept, so the processed documents themselves can be released one by one
    # instead of all being held in memory for the whole split.
    X = [doc.vector for doc in nlp.pipe(texts)]

    if not is_train:
        return X

    # Labels for the corresponding texts
    y = dataset['label'].tolist()
    return X, y

In [13]:
# Creating the training dataset
X_train, y_train = create_data(df_train)

# Creating the validation dataset
X_val, y_val = create_data(df_val)

## Training model

In [14]:
# Gradient-boosted tree classifier; the fixed seed keeps runs repeatable
clf = XGBClassifier(
    random_state=42,
    max_depth=7,
    n_estimators=25,
)

In [15]:
# Train on the training features; the result is re-bound to `clf`, which the validation and test cells use to predict
clf = clf.fit(X_train, y_train)





## Validation

In [16]:
# Predict labels for the held-out validation features
pred_val = clf.predict(X_val)

In [17]:
# Score the validation predictions against the true labels with F1 and accuracy
f1 = f1_score(y_val, pred_val)
accuracy = accuracy_score(y_val, pred_val)

# NOTE(review): a low F1 next to a high accuracy usually points to class
# imbalance — confirm against the label distribution before trusting accuracy
print(f"Validation F1 Score  : {f1} and Accuracy Score {accuracy}")

Validation F1 Score  : 0.06615776081424936 and Accuracy Score 0.7886553412035704


## Submitting results

In [29]:
# By settings is_train=False, the create_data function will only output the features as setuped in the function
X_test = create_data(df_test, is_train=False)
pred_test = clf.predict(X_test)

In [30]:
# Applying the predictions to the labels column of the sample submission 
df_test['label'] = pred_test
df_test.head()

Unnamed: 0,text,label
0,I was already over the edge with Cassie Zamora...,0
1,I think you're right. She has oodles of cash a...,0
2,Haha I love this. I used to give mine phone bo...,0
3,Probably out of desperation as they going no a...,0
4,Sorry !! You’re real good at that!!,0


In [2]:
# loging in to aicrowd
API_KEY = os.environ.get('AICROWD_API_KEY') # Please get your your API Key from [https://www.aicrowd.com/participants/me]
!aicrowd login --api-key $API_KEY

[32mAPI Key valid[0m
[32mSaved API Key successfully![0m


In [3]:
# Re-load the predictions from disk; 'submission.csv' must already exist in the working directory
df_test = pd.read_csv('submission.csv')
df_test.head()

Unnamed: 0,text,label
0,I was already over the edge with Cassie Zamora...,0
1,I think you're right. She has oodles of cash a...,1
2,Haha I love this. I used to give mine phone bo...,0
3,Probably out of desperation as they going no a...,0
4,Sorry !! You’re real good at that!!,0


In [4]:
# Create the assets directory if it is missing; unlike `!mkdir assets` this
# does not fail when the directory already exists, so the cell can be re-run
os.makedirs("assets", exist_ok=True)

# Save the submission file into the assets directory
df_test.to_csv(os.path.join("assets", "submission.csv"), index=False)

mkdir: cannot create directory ‘assets’: File exists


In [5]:
# Submit this notebook to the `emotion-detection` challenge; `-a assets` presumably attaches the assets directory written above — confirm with `aicrowd notebook submit --help`
!aicrowd notebook submit -c emotion-detection -a assets --no-verify

[31mAn unexpected error occured![0m
HTTPConnectionPool(host='localhost', port=8888): Max retries exceeded with url: /api/sessions (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f60edda0c70>: Failed to establish a new connection: [Errno 111] Connection refused'))
To get more information, you can run this command with -v.
To increase level of verbosity, you can go upto -vvvvv
