In [4]:
import pandas as pd
import pathlib
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

In [5]:
# Loading the dataset 
data_dir = pathlib.Path('/home/lv11/Documents/ProyectosPython/sentimentAnalysis/train')
nf = pd.read_csv(data_dir / 'tweetsDataset1.csv',skiprows=1,names=['Message','Target'])


In [6]:
nlp = English()
stop_words = list(STOP_WORDS)
print(STOP_WORDS)

{'it', 'anything', 'above', 'get', 'wherever', 'he', 'than', 'are', 'bottom', 'else', 'whither', 'been', 'she', 'everything', 'almost', 'everywhere', 'have', 'used', 'done', 'what', 'whom', '’re', 'first', 'does', 'became', 'must', 'others', 'see', 'say', 'sometime', 'the', 'two', 'both', 'fifty', 'n‘t', 'seem', 'thru', '‘ve', 'except', 'latter', 'thence', 'who', 'very', 'whoever', 'do', 'fifteen', 'himself', 'much', 'myself', 'by', 'themselves', 'towards', 'that', 'least', 'seemed', 'keep', 'with', 'once', 'next', 'is', 'to', 'toward', 'therefore', 'various', 'even', 'all', 'somewhere', 'thereby', 'thereupon', 'when', 'at', 'but', 'into', 'until', 'someone', 'how', 'nobody', 'serious', 'whose', 'nevertheless', 'him', 'down', 'amongst', 'i', 'three', 'ca', 'ten', '‘re', 'former', "'re", 'wherein', 'over', 'front', 'give', 'against', 'cannot', 'up', 'whatever', 'through', 'hereupon', 'am', 'thus', 'always', 'had', 'if', 'latterly', 'was', 'will', 'or', 'elsewhere', 'really', 'becomes', 

In [7]:
def spacy_tokenizer(sentence):
    tokens = nlp(sentence)
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in tokens ]
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuation ]
    return tokens

class predictors(TransformerMixin):
    
    def transform(self,x, **transform_params):
        return [ clean_text(text) for text in x ]

    def fit(self, x, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}

def clean_text(text):
    return text.strip().lower()

In [8]:
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer,ngram_range=(1,1))
classifier = LinearSVC(dual=False)
tfvectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer)

In [9]:
x = nf['Message']
ylabels = nf['Target']

X_train, X_test, y_train, y_test = train_test_split(x, ylabels, test_size=0.2, random_state=42)

In [10]:
pipe = Pipeline(
    [
        ('cleaner', predictors()),
        ('vectorizer', vectorizer),
        ('classifier', classifier)
    ]
)

In [11]:
pipe.fit(X_train, y_train)
test_prediction = pipe.predict(X_test)

In [12]:
for (sample, prediction) in zip(X_test, test_prediction):
    print(sample," PREDICTION ====> ", prediction)

I miss you unhappy   PREDICTION ====>  0
Thanks for being top engaged community members this week happy  Want this  PREDICTION ====>  1
I agree. My issue would be that it all has to be paid for somehow. I just can't see the numbers adding up. sad   PREDICTION ====>  0
this is so sad what the fuck unhappy   PREDICTION ====>  0
The video will definitely be a 30 minute episode if not longer.  PREDICTION ====>  1
I only just absorbed that I'm not going to get to laugh when he gets 2000 votes. unhappy  .3  PREDICTION ====>  0
Thanks for the recent follow Happy to connect happy  have a great Thursday. Want this.1  PREDICTION ====>  1
I love you so much Rui-chan smile  smile  PREDICTION ====>  1
 Keep  PREDICTION ====>  1
 this is the main reason I drink Total selflessness :D  PREDICTION ====>  1
Koalas are dying of thirst  and it's all because of us unhappy  .12  PREDICTION ====>  0
Hellooo happy  KT Bear (FFABear)  PREDICTION ====>  1
 you don't ever have to thank me J hunt  PREDICTION ====

In [13]:
print("Accuracy test: ",pipe.score(X_test,y_test))
print("Accuracy: ",pipe.score(X_test,test_prediction))

print("Accuracy train: ",pipe.score(X_train,y_train))

Accuracy test:  0.9067245119305857
Accuracy:  1.0
Accuracy train:  0.99457111834962


In [17]:
tweet = ["That play was boring and stupid but it was good tough","that's the dumbiest idea ever","you're not the brighest but I can manage it"]
print(pipe.predict(tweet))

[1 0 1]
