In [1]:
import os
import pathlib
from string import punctuation

import pandas as pd
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.base import TransformerMixin, clone
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Classifiers
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Loading the dataset.
# The directory can be overridden with the SENTIMENT_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original local path is used, so existing runs are unaffected.
DEFAULT_DATA_DIR = '/home/lv11/Documents/ProyectosPython/sentimentAnalysis/train'
data_dir = pathlib.Path(os.environ.get('SENTIMENT_DATA_DIR', DEFAULT_DATA_DIR))
# skiprows=1 discards the first line of the file (presumably its header row --
# confirm against the CSV) and `names` supplies the two column labels.
nf = pd.read_csv(data_dir / 'tweetsDataset1.csv', skiprows=1, names=['Message', 'Target'])
len(nf)

2303

In [3]:
# spaCy English language object; spacy_tokenizer calls it to split text into tokens.
nlp = English()
# List copy of spaCy's English stop-word set, used as the tokenizer's filter.
stop_words = [*STOP_WORDS]
print(STOP_WORDS)

{'each', 'regarding', 'somewhere', 'while', 'did', 'down', 'empty', 'besides', 'yourselves', 'has', "'d", 'her', 'seemed', 'latter', 'have', 'behind', 'three', 'onto', 'myself', 'none', 'that', 'them', 'latterly', 'my', 'what', 'forty', 'serious', "'re", 'wherever', 'five', 'rather', 'through', 'n’t', 'we', 'after', '’d', 'seems', 'less', 'anyhow', 'go', 'out', 'get', 'sixty', 'there', 'yourself', 'move', 'do', 'same', 'nevertheless', 'almost', 'whereafter', 'four', 'thus', 'with', 'across', 'together', 'towards', 'first', 'two', 'both', 'can', 'few', 'nothing', 'your', 'everyone', 'might', 'but', 'meanwhile', 'n‘t', 'those', 'himself', 'however', 'here', 'so', 'fifty', 'am', 'all', 'been', 'whom', 'if', 'yours', 'too', 'full', 'least', 'because', 'hereupon', 'nobody', 'whenever', 'hundred', '‘s', 'about', 'indeed', 'perhaps', 'themselves', 'anywhere', 'say', 'amount', 'somehow', 'herself', 'will', 'any', 'well', 'other', 'much', "'ve", 'alone', 'doing', '’s', 'due', 'as', 'became', 'h

In [4]:
def spacy_tokenizer(sentence):
    """Tokenise ``sentence`` with spaCy and return normalised, filtered tokens.

    Each token is replaced by its lower-cased, stripped lemma. Tokens whose
    lemma is the ``'-PRON-'`` placeholder, or whose lemma is empty (which can
    happen when the pipeline has no lemmatizer component), fall back to the
    lower-cased surface form so that they are not silently lost.

    Parameters
    ----------
    sentence : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        Normalised tokens with empty strings, stop words and punctuation
        removed.
    """
    kept = []
    for word in nlp(sentence):
        lemma = word.lemma_
        if lemma and lemma != '-PRON-':
            token = lemma.lower().strip()
        else:
            # No usable lemma: use the surface form. Stripping matters here
            # because whitespace-only tokens must collapse to '' and be dropped.
            token = word.lower_.strip()
        # `punctuation` is a str, so `in` is a substring test: every single
        # punctuation character is rejected, while multi-character tokens that
        # are not a contiguous slice of it (for example "...") are kept.
        if token and token not in stop_words and token not in punctuation:
            kept.append(token)
    return kept

class predictors(TransformerMixin):
    """Pipeline step that normalises raw text with ``clean_text``.

    The step keeps no state: fitting is a no-op and it exposes no parameters.
    """

    def fit(self, x, y=None, **fit_params):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, x, **transform_params):
        """Return a list with ``clean_text`` applied to every item of ``x``."""
        return list(map(clean_text, x))

    def get_params(self, deep=True):
        """Return an empty dict, since there is nothing to configure."""
        return {}

def clean_text(text):
    """Return ``text`` with surrounding whitespace removed and lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

In [5]:
# Count-based features over single tokens, produced by the spaCy tokenizer.
vectorizer = CountVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
)
# TF-IDF alternative built on the same tokenizer.
tfvectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer)
# Stand-alone classifier; swap in one of the alternatives to try another model:
#   RandomForestClassifier(n_estimators=200, random_state=42)
#   LogisticRegression()
#   LinearSVC(dual=False)
classifier = MultinomialNB()

In [7]:
# Features are the raw messages; labels are the sentiment targets.
x, ylabels = nf['Message'], nf['Target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    ylabels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [47]:
def build_pipeline(classifier):
    """Return a cleaner -> vectorizer -> classifier pipeline.

    Every pipeline receives its own unfitted copy of the module-level
    ``vectorizer`` (same settings, independent state). Sharing one vectorizer
    instance between pipelines would mean that fitting any of them refits the
    vectorizer used by all the others.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier to use as the final step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'cleaner'``, ``'vectorizer'`` and
        ``'classifier'``.
    """
    return Pipeline(
        [
            ('cleaner', predictors()),
            ('vectorizer', clone(vectorizer)),
            ('classifier', classifier),
        ]
    )


# One pipeline per candidate model (variable names kept as-is for compatibility).
pipeSVM = build_pipeline(LinearSVC(dual=False))
pipeLogisticRegresion = build_pipeline(LogisticRegression())
pipeRFC = build_pipeline(RandomForestClassifier(n_estimators=200, random_state=42))
pipeNaiveNayes = build_pipeline(MultinomialNB())

# Pipeline used by the cells that follow. After fitting, its vectorizer is
# available as pipe.named_steps['vectorizer'].
pipe = pipeSVM

In [48]:
# Train the selected pipeline, then label the held-out messages.
test_prediction = pipe.fit(X_train, y_train).predict(X_test)

In [49]:
# Show every held-out message next to the label predicted for it.
sample_prediction_pairs = zip(X_test, test_prediction)
for sample, prediction in sample_prediction_pairs:
    print(sample, " PREDICTION ====> ", prediction)

   PREDICTION ====>  0
Not really an amount just loads of sections unhappy  I'll ride up to to netherton now meet me at Darby end ?  PREDICTION ====>  0
True! Khilado kuch unhappy   PREDICTION ====>  0
Pls RT[NCT FIC] Love Song  PREDICTION ====>  1
I used to like my neighbours  PREDICTION ====>  1
 I always know when it come but have no fucking clue how it goes unhappy    PREDICTION ====>  0
Ah awesome! Pixel art stuff by any chance? happy  PREDICTION ====>  1
Don't think I'm pretty enough unhappy   PREDICTION ====>  0
 I'm so glad I have you   PREDICTION ====>  1
If you can't be there for someone at their worst then you don't deserve to be there at their best :)KISSES TheFashionIcon  PREDICTION ====>  1
.the person who chose shut the fuck up: unhappy   PREDICTION ====>  0
 get a dog. :D  PREDICTION ====>  1
- Thank you tons  PREDICTION ====>  1
 or SHIT for short :)*Thanks to my friend maddy over at  PREDICTION ====>  1
 no sign of Garry Barlow yet sad   PREDICTION ====>  0
 thanks so

In [53]:
# Accuracy against the true labels, on held-out and on training data.
# Predictions are only ever compared with the true labels: scoring the model
# against its own predictions yields 1.0 by construction and says nothing
# about quality.
print("Accuracy test: ", pipe.score(X_test, y_test))
print("Accuracy train: ", pipe.score(X_train, y_train))

print("Confusion matrix ---")
print(confusion_matrix(y_test, test_prediction))
print("Classification report ---")
print(classification_report(y_test, test_prediction))
# Cross-check: held-out accuracy recomputed from the stored predictions.
print(accuracy_score(y_test, test_prediction))



Accuracy test:  0.9067245119305857
Accuracy:  1.0
Accuracy train:  0.99457111834962
Confusion matrix ---
[[198  25]
 [ 18 220]]
Classification report ---
              precision    recall  f1-score   support

           0       0.92      0.89      0.90       223
           1       0.90      0.92      0.91       238

    accuracy                           0.91       461
   macro avg       0.91      0.91      0.91       461
weighted avg       0.91      0.91      0.91       461

0.9067245119305857


In [59]:
# Ad-hoc check of the fitted pipeline on a few hand-written sentences.
tweet = [
    "That play was boring and stupid but it was good tough",
    "that's the dumbiest idea ever",
    "you're not the brighest but I can manage it",
]
print(pipe.predict(tweet))

[1 0 1 1]
