In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pathlib
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the labelled tweets. The first row of the CSV is skipped and the two
# columns are named explicitly as 'Message' and 'Target'.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_dir = pathlib.Path('/home/lv11/Documents/ProyectosPython/sentimentAnalysis/train')
tweets_csv = data_dir.joinpath('tweetsDataset1.csv')
nf = pd.read_csv(tweets_csv, names=['Message', 'Target'], skiprows=1)
print(nf.head(20))

Message  Target
0                                             Hi Guys       1
1             it's just got 3 years longer unhappy          0
2   Hey thanks for being top new followers this we...       1
3                                 I know how you feel       0
4                                          huh? happy       1
5                             Didn't keep it unhappy        0
6                                These are for  happy       1
7   unhappy  how come people like this have childr...       0
8                                  Bapak Tom Cruise.        0
9   help me too unhappy  as an early birthday gift...       0
10  Shoutout to for the donation. Looks like I'm s...       1
11                             Good luck Swampy happy       1
12  lets agree she put her brain on knees crying w...       1
13                                         Nitpicking       1
14                 unhappy  :/ unhappy   feeling sick       0
15                      which is good for him happy.1 

In [4]:
# English language object used by the tokenizer defined further down.
nlp = English()
# Materialise spaCy's STOP_WORDS collection as a list and show it.
stop_words = [*STOP_WORDS]
print(stop_words)

['beside', 'anyway', 'besides', 'whereby', 'therefore', 'meanwhile', 'perhaps', 'make', 'empty', 'really', 'whereupon', 'three', 'well', 'even', 'hers', 'on', 'onto', '’s', 'elsewhere', 'here', '’ve', 'after', 'than', 'thereafter', 'whereas', 'did', 'twenty', 'wherever', 'more', 'yours', 'were', '’re', '‘ve', 'to', 'this', "'ll", "'d", 'eleven', 'became', 'in', 'another', 'ourselves', 'throughout', 'whence', 'sixty', 'off', 'thru', 'seems', 'forty', 'seem', 'thereby', 'if', 'whither', 'fifteen', 'latter', 'yourselves', 'behind', 'top', 'fifty', 'one', 'most', 'yet', 'put', 'done', 'does', 'once', 'please', 'herself', 'sometimes', 'those', 'nobody', 'has', 'how', 'give', 'such', 'indeed', 'call', 'six', 'full', 'she', 'they', 'whenever', 'somewhere', 'twelve', 'much', 'whom', '‘s', 'however', 'so', 'thus', 'n‘t', 'therein', 'cannot', 'n’t', 'upon', 'why', '’d', 'as', 'by', 'themselves', 'doing', 'many', 'moreover', 'since', 'anyhow', 'itself', 'either', 'namely', 'towards', 'a', 'unless

In [5]:
def spacy_tokenizer(sentence):
    """Split ``sentence`` into normalised tokens.

    The text is run through the module-level ``nlp`` object. Each token is
    replaced by its lower-cased, stripped lemma; when the lemma is the
    ``'-PRON-'`` placeholder or is empty (e.g. the pipeline has no
    lemmatizer), the token's own lower-cased text is used instead.

    Tokens are then discarded when they are empty, appear in the module-level
    ``stop_words``, or consist solely of ``string.punctuation`` characters.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    normalised = []
    for word in nlp(sentence):
        lemma = word.lemma_
        # Fall back to the surface form when no usable lemma is available;
        # otherwise an empty lemma would silently erase every token.
        if not lemma or lemma == '-PRON-':
            text = word.lower_
        else:
            text = lemma.lower()
        normalised.append(text.strip())

    # `punctuation` is a string, so `tok in punctuation` would be a substring
    # test and let runs such as '...' or ':/' through. Check per character.
    return [
        tok for tok in normalised
        if tok
        and tok not in stop_words
        and not all(ch in punctuation for ch in tok)
    ]

In [6]:
# Bag-of-words vectorizer. lowercase=False keeps the original casing, so
# 'Hi' and 'hi' end up as distinct vocabulary entries.
# min_df=1 (the default) keeps every term that occurs in at least one document,
# which is exactly what min_df=0 selected; an integer 0 is rejected by the
# parameter validation in recent scikit-learn releases.
# NOTE(review): spacy_tokenizer is not passed as `tokenizer=` here, so the
# default token pattern is used — confirm whether that is intended.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(nf['Message'])
# Mapping of term -> column index learned from the full 'Message' column.
vectorizer.vocabulary_

{'Hi': 660,
 'Guys': 601,
 'it': 2926,
 'just': 2974,
 'got': 2680,
 'years': 4533,
 'longer': 3123,
 'unhappy': 4322,
 'Hey': 659,
 'thanks': 4172,
 'for': 2573,
 'being': 1844,
 'top': 4237,
 'new': 3328,
 'followers': 2567,
 'this': 4195,
 'week': 4439,
 'Much': 960,
 'appreciated': 1733,
 'happy': 2737,
 'know': 3023,
 'how': 2827,
 'you': 4541,
 'feel': 2505,
 'huh': 2831,
 'Didn': 432,
 'keep': 2991,
 'These': 1407,
 'are': 1736,
 'come': 2098,
 'people': 3471,
 'like': 3090,
 'have': 2749,
 'children': 2048,
 'where': 4455,
 'the': 4175,
 'state': 4026,
 'intervention': 2908,
 'Bapak': 247,
 'Tom': 1428,
 'Cruise': 386,
 'help': 2776,
 'me': 3196,
 'too': 4234,
 'as': 1755,
 'an': 1690,
 'early': 2362,
 'birthday': 1873,
 'gift': 2644,
 'huhu': 2832,
 'Shoutout': 1273,
 'to': 4221,
 'donation': 2309,
 'Looks': 854,
 'streaming': 4052,
 'some': 3963,
 'Stanley': 1308,
 'Parable': 1080,
 'later': 3047,
 'Good': 579,
 'luck': 3145,
 'Swampy': 1344,
 'lets': 3079,
 'agree': 1653,
 '

In [7]:
# Term counts for every message, expanded from sparse to a dense array for display.
message_counts = vectorizer.transform(nf['Message'])
message_counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# Separate the raw text (features) from the 0/1 target column (labels).
messages, labels = nf['Message'], nf['Target']

In [9]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.25,
    random_state=1000,
    shuffle=True,
)

In [10]:
# Re-learn the vocabulary from the training messages only and encode them in
# the same step; the test messages are encoded with that training vocabulary.
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

X_train

<1727x3749 sparse matrix of type '<class 'numpy.int64'>'
	with 15512 stored elements in Compressed Sparse Row format>

In [11]:
# Logistic regression with default settings, fitted on the training counts.
# fit() returns the estimator itself, so the calls can be chained.
classifier = LogisticRegression().fit(X_train, y_train)
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Score the fitted classifier on the held-out split and report it.
result = classifier.score(X=X_test, y=y_test)
print('Accuracy: ', result)

Accuracy:  0.9184027777777778


In [14]:
# Hand-written sentences used as a quick sanity check of the trained model.
sample_texts = [
    "That play was boring and stupid but it was good tough",
    "that's the dumbiest idea ever",
    "you're not the brighest but I can manage it",
]
# Encode with the fitted vocabulary, then predict one label per sentence.
vec = vectorizer.transform(sample_texts)
some = classifier.predict(vec)

In [15]:
# Show the predicted labels for the sample sentences, in input order.
# NOTE(review): presumably 1 = happy/positive and 0 = unhappy/negative, as in the 'Target' column — confirm.
some

array([0, 1, 0])