<a href="https://colab.research.google.com/github/onlyabhilash/NLP-Code/blob/main/part-7/TwitterSentiment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook we combine the concepts from the previous notebooks: we pre-process tweets using our preprocessing pipeline, build embeddings for them, and then classify them using a logistic regression model.

In [None]:
#Making the necessary imports
import os
import sys

# Location of the local preprocessing helper module (edit to match your checkout).
preprocessing_path = "/home/etherealenvy/Downloads/practical-nlp/Ch8/O5_smtd_preprocessing.py"
# sys.path entries must be directories (or zip archives), not individual .py
# files, so register the module's parent directory. The membership check keeps
# re-running this cell from piling up duplicate entries.
preprocessing_dir = os.path.dirname(os.path.abspath(preprocessing_path))
if preprocessing_dir not in sys.path:
    sys.path.append(preprocessing_dir)

import O5_smtd_preprocessing

from nltk.corpus import stopwords
from string import punctuation

import pandas as pd
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')

from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()


#imports related to modeling
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Reading and Preprocessing
Let's read the dataset and pre-process the tweets using the pre-processing pipeline.

In [None]:
# Root folder that contains the `practical-nlp` checkout (edit before running).
datapath = "Path to repo"
csv_path = os.path.join(datapath, "practical-nlp", "Ch8", "Data", "sts_gold_tweet.csv")

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="skip"` is its replacement (malformed rows are dropped).
df = pd.read_csv(csv_path, on_bad_lines="skip", delimiter=";")
# Drop incomplete rows and the `id` column without mutating in place.
df = df.dropna(how='any').drop(columns=['id'])
display(df.head())

# Build the stopword set once: calling stopwords.words('english') for every
# token re-reads the corpus list each time and does a linear scan on it.
english_stopwords = set(stopwords.words('english'))

#pre-process tweets using our package
df['tweet'] = df['tweet'].apply(O5_smtd_preprocessing.process_TweetText)
df['tweet_tokens'] = df['tweet'].apply(tweet_tokenizer.tokenize)
df['tweet_no_stopwords'] = df['tweet_tokens'].apply(
    lambda tokens: [word for word in tokens if word not in english_stopwords]
)
tweets_processed = df['tweet_tokens'].values
tweets_cat = df['polarity'].values

display(df.head())
print("Number of tweets and categories")
print(len(tweets_processed), len(tweets_cat))
print("\nExamle of polarity, processed tweet, processed tweet without stopwords")
print(tweets_cat[0],',',tweets_processed[0],',',df['tweet_no_stopwords'].values[0])

Unnamed: 0,polarity,tweet
0,0,the angel is going to miss the athlete this we...
1,0,It looks as though Shaq is getting traded to C...
2,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
3,0,drinking a McDonalds coffee and not understand...
4,0,So dissapointed Taylor Swift doesnt have a Twi...


Unnamed: 0,polarity,tweet,tweet_tokens,tweet_no_stopwords
0,0,the angel is going to miss the athlete this we...,"[the, angel, is, going, to, miss, the, athlete...","[angel, going, miss, athlete, weekend]"
1,0,it looks as though shaq is getting traded to c...,"[it, looks, as, though, shaq, is, getting, tra...","[looks, though, shaq, getting, traded, clevela..."
2,0,constantnonbrandmention constantdate isn't com...,"[constantnonbrandmention, constantdate, isn't,...","[constantnonbrandmention, constantdate, coming..."
3,0,drinking a mcdonalds coffee and not understand...,"[drinking, a, mcdonalds, coffee, and, not, und...","[drinking, mcdonalds, coffee, understanding, s..."
4,0,so dissapointed taylor swift doesnt have a twi...,"[so, dissapointed, taylor, swift, doesnt, have...","[dissapointed, taylor, swift, doesnt, twitter]"


Number of tweets and categories
2034 2034

Examle of polarity, processed tweet, processed tweet without stopwords
0 , ['the', 'angel', 'is', 'going', 'to', 'miss', 'the', 'athlete', 'this', 'weekend'] , ['angel', 'going', 'miss', 'athlete', 'weekend']


## Train your own Embedding

In [None]:
#CBOW
import time

# Train a CBOW (sg=0) Word2Vec model on the tokenised tweets, ignoring tokens
# seen fewer than 5 times. A fixed seed together with a single worker thread
# makes the learned vectors repeatable between runs; per the gensim docs,
# full determinism on Python 3 also needs PYTHONHASHSEED set for the kernel.
start = time.time()
w2v_model = Word2Vec(tweets_processed, min_count=5, sg=0, seed=2020, workers=1)
end = time.time()

print("CBOW Model Training Complete.\nTime taken for training is:{:.5f} sec ".format((end-start)))

CBOW Model Training Complete.
Time taken for training is:0.26048 sec 


In [None]:
#Create document vectors by averaging word vectors.
def embedding_feats(list_of_lists, model=None):
    """Turn each token list into one vector by averaging its word vectors.

    Parameters
    ----------
    list_of_lists : iterable of iterables of str
        One token sequence per document.
    model : optional
        Object providing the word vectors: either something with a ``.wv``
        attribute or a keyed-vectors-like object itself (supporting ``in``,
        ``[]`` and ``vector_size``). Defaults to the notebook-level
        ``w2v_model``.

    Returns
    -------
    list of numpy.ndarray
        One float64 vector of length ``vector_size`` per input document, in
        input order. Tokens missing from the vocabulary are ignored; a
        document with no known tokens yields an all-zero vector.
    """
    if model is None:
        model = w2v_model
    # Membership tests and indexing on the model object itself were deprecated
    # in gensim 3 and removed in gensim 4; `.wv` works on both.
    keyed_vectors = getattr(model, "wv", model)
    # Read the dimensionality from the vectors instead of hard-coding 100.
    dimension = keyed_vectors.vector_size

    feats = []
    for tokens in list_of_lists:
        known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if known:
            # Average in float64, matching the dtype of the zero-vector fallback.
            feats.append(np.mean(known, axis=0, dtype=np.float64))
        else:
            feats.append(np.zeros(dimension))
    return feats

# One averaged embedding per tweet, built from the stopword-free token lists;
# the printed count is the number of document vectors produced.
train_vectors = embedding_feats(
    df['tweet_no_stopwords'].values
)
print(len(train_vectors))

2034


array([-0.11000946,  0.08224008, -0.2457916 , -0.01220685, -0.18601269,
        0.00523998, -0.00507181, -0.07762612, -0.10011366,  0.33908988,
        0.06252642, -0.04345755,  0.12749993, -0.35703743,  0.2398896 ,
       -0.08165025, -0.27893725, -0.08432864, -0.39982322,  0.11275407,
       -0.35310337,  0.04924289, -0.05515452,  0.19547662, -0.08554919,
       -0.15641166, -0.30485598,  0.33278381,  0.17344698, -0.12306982,
        0.14739941, -0.22321756, -0.04895892, -0.08503899,  0.10244402,
       -0.21976046, -0.13981797,  0.03915762,  0.01270108, -0.04367448,
        0.01347027, -0.08552931,  0.04630744,  0.05024158, -0.06494117,
        0.2162953 ,  0.01547404, -0.11982584, -0.12661303,  0.18651646,
       -0.21306013, -0.10556066,  0.07856253,  0.33509017,  0.2425042 ,
        0.07452476, -0.07753238,  0.15359538,  0.16840796, -0.14462264,
        0.22466612,  0.1852351 ,  0.21337978,  0.03906654,  0.28573281,
       -0.27614906, -0.06802826, -0.0617772 ,  0.18794162,  0.11

In [None]:
#Take any classifier (LogisticRegression here)
classifier = LogisticRegression(random_state=2020)

# Seed the split so the reported metrics are reproducible, and stratify on the
# label so the imbalanced polarity classes keep the same ratio in the train
# and test partitions.
polarity_labels = df['polarity'].values
train_data, test_data, train_cats, test_cats = train_test_split(
    train_vectors,
    polarity_labels,
    random_state=2020,
    stratify=polarity_labels,
)

classifier.fit(train_data, train_cats)
print("Accuracy: ", classifier.score(test_data, test_cats))
preds = classifier.predict(test_data)
# Per-class precision/recall shows whether the minority class is being predicted at all.
print(classification_report(test_cats, preds))

Accuracy:  0.6797642436149313
              precision    recall  f1-score   support

           0       0.68      1.00      0.81       346
           4       0.00      0.00      0.00       163

    accuracy                           0.68       509
   macro avg       0.34      0.50      0.40       509
weighted avg       0.46      0.68      0.55       509

