In [70]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [71]:
# Load the tweets joined with the daily price columns and the up/down target.
csv_path = 'data_preprocessing/dataset_with_target.csv'
dataset = pd.read_csv(csv_path, encoding='unicode_escape')
# Preview the first rows.
dataset.head()

Unnamed: 0.1,Unnamed: 0,Date,Tweet,Open,Close,inc_dec
0,0,2012-11-16,RT JonFavreau My Model S just arrived and I we...,31.15,31.84,1
1,1,2012-11-19,Just returned from a trip to London and Oxford...,32.07,32.919998,1
2,2,2012-11-20,These articles in Space News describe why Aria...,32.799999,33.0,1
3,3,2012-11-21,Love this picture of the Curiousity rover on M...,32.610001,32.470001,0
4,4,2012-11-23,Liam Neesons Lifes Too Short sketch is super f...,32.599998,32.130001,0


In [72]:
# Remove the columns that are not used as features or target below.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
dataset = dataset.drop(columns=['Unnamed: 0', 'Open', 'Close'], errors='ignore')

In [73]:
# Clean every tweet: keep ASCII letters only, lowercase, remove English stop
# words, lemmatize the remaining tokens and join them back into one string.
wnl = WordNetLemmatizer()
# Build the stop-word set once; rebuilding it for every token is wasted work.
english_stopwords = set(stopwords.words('english'))

new_tweet = []
# Iterate over the column itself rather than a hard-coded row count, so the
# cell keeps working when the dataset grows or shrinks.
for raw_tweet in dataset['Tweet']:
    # Anything that is not a letter becomes a space, then tokenise on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', raw_tweet).lower().split()
    kept = [wnl.lemmatize(word) for word in words if word not in english_stopwords]
    new_tweet.append(' '.join(kept))

In [74]:
# Spot-check the cleaned text of the tweet at index 1.
new_tweet[1]

'returned trip london oxford met many interesting people really like britain'

In [75]:
# Store the cleaned tweets next to the raw ones as a new column.
dataset['new_tweet'] = new_tweet

In [76]:
# Character length of each cleaned tweet, in row order.
lengths = [len(twit) for twit in dataset['new_tweet']]
print(lengths[:10])

[76, 75, 443, 211, 133, 213, 157, 22, 207, 57]


In [77]:
# Sanity check: number of cleaned tweets.
len(new_tweet)

633

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned tweets into a dense TF-IDF matrix, one row per tweet.
# NOTE(review): the vectoriser is fitted on every row before any train/test
# split, so its IDF weights are computed with the held-out tweets included.
tfidf = TfidfVectorizer()
X = pd.DataFrame(tfidf.fit_transform(new_tweet).toarray())
# pd.DataFrame assigns integer column labels by default. Cast them to strings so
# that string-named feature columns can be added later without producing mixed
# int/str labels, which newer scikit-learn releases reject when fitting.
X.columns = X.columns.astype(str)
# Target column read from the dataset.
y = dataset['inc_dec']

In [79]:
# (rows, columns) of the TF-IDF feature frame.
X.shape

(633, 7359)

In [81]:
# Add the character length of each cleaned tweet as one extra feature column.
X['len_tweets'] = lengths

In [82]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [83]:
## Logistic regression
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construct and train in one step and
# end the cell with the estimator to display its parameters.
lr = LogisticRegression().fit(X_train, y_train)
lr



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [84]:
# Predict the held-out rows with the logistic regression model and display them.
# `y_pred` is reused by the later model cells, so it always holds the most
# recently executed model's predictions.
y_pred = lr.predict(X_test)
y_pred

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64)

In [85]:
from sklearn.metrics import classification_report, accuracy_score

# Score whatever predictions `y_pred` currently holds against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.5590551181102362
              precision    recall  f1-score   support

           0       0.59      0.59      0.59        68
           1       0.53      0.53      0.53        59

    accuracy                           0.56       127
   macro avg       0.56      0.56      0.56       127
weighted avg       0.56      0.56      0.56       127



In [86]:
## XGBoost classifier
import xgboost

# Train a gradient-boosted tree classifier with the library defaults; fit()
# returns the estimator, which is then displayed.
xgbclassifier = xgboost.XGBClassifier().fit(X_train, y_train)
xgbclassifier

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [87]:
# Predict the held-out rows with the XGBoost model (overwrites the previous
# `y_pred`) and display them.
y_pred = xgbclassifier.predict(X_test)
y_pred

array([0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], dtype=int64)

In [88]:
from sklearn.metrics import classification_report, accuracy_score

# Score whatever predictions `y_pred` currently holds against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.6141732283464567
              precision    recall  f1-score   support

           0       0.62      0.71      0.66        68
           1       0.60      0.51      0.55        59

    accuracy                           0.61       127
   macro avg       0.61      0.61      0.61       127
weighted avg       0.61      0.61      0.61       127



In [89]:
## Naive Bayes
from sklearn.naive_bayes import GaussianNB
gauss = GaussianNB()
gauss.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [90]:
# Predict the held-out rows with the Naive Bayes model (overwrites the previous
# `y_pred`) and display them.
y_pred = gauss.predict(X_test)
y_pred

array([0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1], dtype=int64)

In [91]:
from sklearn.metrics import classification_report, accuracy_score

# Score whatever predictions `y_pred` currently holds against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.44881889763779526
              precision    recall  f1-score   support

           0       0.48      0.40      0.44        68
           1       0.42      0.51      0.46        59

    accuracy                           0.45       127
   macro avg       0.45      0.45      0.45       127
weighted avg       0.45      0.45      0.45       127



In [92]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the XGBoost classifier on the full feature frame.
scores = cross_val_score(estimator=xgbclassifier, X=X, y=y, cv=5)
print(scores)

[0.54330709 0.5511811  0.55905512 0.51968504 0.6       ]


In [93]:
# Summarise the fold scores as mean and two standard deviations.
mean_accuracy = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, two_sigma))

Accuracy: 0.55 (+/- 0.05)


In [94]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the logistic regression model on the full feature frame.
scores = cross_val_score(estimator=lr, X=X, y=y, cv=5)
print(scores)



[0.47244094 0.51968504 0.54330709 0.48818898 0.576     ]




In [95]:
# Summarise the fold scores as mean and two standard deviations.
mean_accuracy = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, two_sigma))

Accuracy: 0.52 (+/- 0.07)


In [96]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the Gaussian Naive Bayes model on the full feature frame.
scores = cross_val_score(estimator=gauss, X=X, y=y, cv=5)
print(scores)

[0.48818898 0.56692913 0.48031496 0.42519685 0.432     ]


In [97]:
# Summarise the fold scores as mean and two standard deviations.
mean_accuracy = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, two_sigma))

Accuracy: 0.48 (+/- 0.10)
