In [15]:
import os
import pathlib
from string import punctuation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import accessing_published_tweets, cleaningTweets

In [16]:
# Loading the dataset
# The directory defaults to the original author's local checkout; set the
# SENTIMENT_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
data_dir = pathlib.Path(os.environ.get(
    'SENTIMENT_DATA_DIR',
    '/home/lv11/Documents/ProyectosPython/sentimentAnalysis/train',
))
# skiprows=1 discards the first line of the file (presumably its header row) and
# `names` supplies the column labels used by the rest of the notebook.
nf = pd.read_csv(data_dir / 'tweetsDataset1.csv', skiprows=1, names=['Message', 'Target'])

In [17]:
# English language pipeline from spaCy, created with its default settings.
nlp = English()
# Copy spaCy's built-in English stop-word collection into a plain list.
stop_words = [word for word in STOP_WORDS]

In [18]:
# Bag-of-words encoder. Case is preserved (lowercase=False), so 'Happy' and
# 'happy' are counted as different tokens.
# min_df=1 keeps every term that occurs in at least one message. This selects the
# same vocabulary as the previous integer 0, which newer scikit-learn releases
# reject as an invalid parameter value.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
# Fit on the full corpus so the next cell can preview the encoded matrix.
vectorizer.fit(nf['Message'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=0,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [19]:
# Encode every message with the fitted vocabulary, then densify the result so the
# notebook displays it as a regular array.
message_counts = vectorizer.transform(nf['Message'])
message_counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
# Model inputs (message text) and the values to predict (Target column).
messages, labels = nf['Message'], nf['Target']

In [21]:
# Hold out 25% of the shuffled rows for evaluation; the fixed random_state makes
# the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.25,
    shuffle=True,
    random_state=1000,
)

In [22]:
# Re-learn the vocabulary from the training messages only and encode them in the
# same pass; the test messages are then encoded with that training vocabulary.
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)
X_train

<1727x3749 sparse matrix of type '<class 'numpy.int64'>'
	with 15512 stored elements in Compressed Sparse Row format>

In [23]:
# Estimator under test. Alternatives that have been tried in this cell:
#   LinearSVC(dual=False)
#   MultinomialNB()
#   RandomForestClassifier(n_estimators=200)
#   LogisticRegression()
# NOTE(review): the saved output of this cell shows a RandomForestClassifier, so it
# appears to have been last executed with a different choice - re-run to refresh.
classifier = LinearSVC(dual=False)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
# Predict a label for every held-out message; test_prediction is reused by the
# evaluation cell.
test_prediction = classifier.predict(X_test)

# Show each held-out message next to the label the model assigned to it.
for message_text, predicted_label in zip(x_test, test_prediction):
    print(message_text, " PREDICTION ====> ", predicted_label)

jimin to yoongi unhappy  .4  PREDICTION ====>  0
realized i havent eaten rice maghapon because i finished 3 packs of spicy seafood noodles + to  extra big chilimansi :(acid attack right now  PREDICTION ====>  0
clauditte  PREDICTION ====>  0
I used to like my neighbours  PREDICTION ====>  1
 had chopsuey before that unhappy  laughing my ass off humble feeds are always the bomb!  PREDICTION ====>  0
KissWard tagline:tweet with us Pans :)NoOneCanSTOP KissWard  PREDICTION ====>  1
I check Beautiful News every single day  PREDICTION ====>  1
- Thank you tons.1  PREDICTION ====>  1
How can I set an alarm for 9am and snooze it until 12 unhappy   PREDICTION ====>  0
Great! You're welcome Josh happy  ^Adam  PREDICTION ====>  1
32- for the love of a daughter crying    PREDICTION ====>  1
failed unhappy  .g hehe  PREDICTION ====>  0
I miss it already unhappy   PREDICTION ====>  0
My bed is so comfortable I don't want to get up unhappy   PREDICTION ====>  0
I wish my dogs knew how much I hyped th

In [25]:
# Mean accuracy on the held-out split, computed once and reported under both labels.
result = classifier.score(X_test, y_test)
print('Accuracy: ', result)

print("Accuracy test: ", result)
# Accuracy on the data the model was fitted on, for comparison with the test figure.
train_accuracy = classifier.score(X_train, y_train)
print("Accuracy train: ", train_accuracy)

# Per-class breakdown based on the predictions stored in test_prediction.
print("Confusion matrix ---")
print(confusion_matrix(y_test, test_prediction))
print("Classification report ---")
print(classification_report(y_test, test_prediction))
print(accuracy_score(y_test, test_prediction))

Accuracy:  0.8802083333333334
Accuracy test:  0.8802083333333334
Accuracy train:  1.0
Confusion matrix ---
[[266  22]
 [ 47 241]]
Classification report ---
              precision    recall  f1-score   support

           0       0.85      0.92      0.89       288
           1       0.92      0.84      0.87       288

    accuracy                           0.88       576
   macro avg       0.88      0.88      0.88       576
weighted avg       0.88      0.88      0.88       576

0.8802083333333334


In [26]:
# Hand-written sentences used as a quick sanity check of the fitted model.
sample_sentences = [
    "That play was boring and stupid but it was good tough",
    "that's the dumbiest idea ever",
    "you're not the brighest but I can manage it",
    "i hate the oranges",
]
vec = vectorizer.transform(sample_sentences)
some = classifier.predict(vec)
some

array([0, 1, 0, 1])

In [27]:
# Account whose timeline is classified below.
user = 'Luckficus'
twitterClient = accessing_published_tweets.TwitterClient(user)
tweetAnalyzer = accessing_published_tweets.TweetAnalyzer()

# Fetch the timeline, then convert it to a DataFrame via the project helper.
# NOTE(review): 13 is presumably the number of tweets requested - confirm against
# accessing_published_tweets.
timeline = twitterClient.get_user_timeline_tweets(13)
tweets = tweetAnalyzer.tweets_to_data_frame(timeline)

tweets['tweets']

0     RT @ornellavezzoso: #comodoro El clima con nos...
1     @liamoreno17 Jsjajaja en ig tengo un bot que l...
2     The sklearn model is performing great! Happy n...
3       There is people being hurt and they don't care.
4     All the notebooks are really out of my reach r...
5     The Crown is really boring, it's like watch th...
6     @AbriltheDuchess No me cancelen, son twts d pr...
7     I wanted to do some platzi courses but I can't...
8                                    @covipex Por favor
9                 Boka juniors is unhappy since 9/12/18
10                     My sis cooked some good brownies
11                   I hate all the Harry Potter movies
12    River tied with Sao Paulo, it was such a great...
Name: tweets, dtype: object

In [28]:
# Classify the whole timeline in one batch instead of vectorising and predicting
# one tweet at a time. The sparse matrix is passed straight to the classifier, so
# no dense copy is built.
tweet_texts = list(tweets['tweets'])
# An empty timeline is handled explicitly: predict() rejects a matrix with zero rows.
tweet_labels = classifier.predict(vectorizer.transform(tweet_texts)) if tweet_texts else []

# Label 0 is reported as 'Unhappy'; any other predicted label as 'Happy'.
for tweet, label in zip(tweet_texts, tweet_labels):
    sentiment = 'Unhappy' if label == 0 else 'Happy'
    print(tweet,", sentiment: ",sentiment)

RT @ornellavezzoso: #comodoro El clima con nosotros ya tiene algo personal. Confirmadisimo! https://t.co/EjMN3FePx2 , sentiment:  Happy
@liamoreno17 Jsjajaja en ig tengo un bot que likea cosas, aca en tw todavía no tengo , sentiment:  Unhappy
The sklearn model is performing great! Happy news to me. , sentiment:  Happy
There is people being hurt and they don't care. , sentiment:  Happy
All the notebooks are really out of my reach right now. Sad moment :( , sentiment:  Happy
The Crown is really boring, it's like watch the history Channel , sentiment:  Happy
@AbriltheDuchess No me cancelen, son twts d prueba , sentiment:  Unhappy
I wanted to do some platzi courses but I can't. Depressed. Thanks Albert Fernández ahre , sentiment:  Unhappy
@covipex Por favor , sentiment:  Unhappy
Boka juniors is unhappy since 9/12/18 , sentiment:  Unhappy
My sis cooked some good brownies , sentiment:  Unhappy
I hate all the Harry Potter movies , sentiment:  Happy
River tied with Sao Paulo, it was such a gre