In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# directory by directory, in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Plotting/Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import csv

from textblob import TextBlob
import re

import nltk
# nltk.download()
# Fetch every NLTK resource this notebook relies on, in a fixed order.
for nltk_resource in (
    'stopwords',
    'wordnet',
    'punkt',
    'averaged_perceptron_tagger',
    'vader_lexicon',
):
    nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# English stop-word list from NLTK; preprocess() filters tokens against it.
stop = stopwords.words('english')
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import pos_tag, pos_tag_sents

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [None]:
# df = pd.read_csv("../input/stockticker10tweets5321-label/stockticker10-tweets(5-3-21) label.csv")
# Path of the tweets CSV under the read-only ../input/ directory.
TWEETS_CSV = "../input/stockticker10tweets/stockticker10-tweets.csv"
df = pd.read_csv(TWEETS_CSV)
df.head()

# Preprocessing

In [None]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Metadata columns that the analysis cells in this notebook never read.
unused_columns = ['Datetime(UTC)', 'Tweet ID', 'RT', 'RT Count', 'Fav Count']

# errors='ignore' keeps this cell re-runnable: a plain drop() raises KeyError
# the second time because the columns are already gone. The chained dropna()
# replaces the in-place mutation and removes rows that have no manual
# sentiment label, since those can be neither scored nor used for training.
df = (
    df.drop(columns=unused_columns, errors='ignore')
      .dropna(subset=['Sentiment (2pos/1neu/0neg)'])
)
df.head()

In [None]:
# Shared NLTK normaliser instances: a WordNet lemmatizer and a Porter stemmer.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# The patterns and the tokenizer are built once here instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+')
_DIGITS_RE = re.compile(r'[0-9]+')
# Raw string so that \S reaches the regex engine instead of being parsed as a
# (invalid) string escape, which newer Python versions warn about.
_MENTION_RE = re.compile(r'@\S+')
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def preprocess(text):
    """Reduce one tweet to a space-separated string of lower-case content words.

    The input is converted with ``str()``, lower-cased, stripped of the literal
    ``{html}`` marker, then HTML tags, URLs (anything starting with ``http``),
    digit runs and ``@mentions`` are removed in that order. The remainder is
    split on ``\\w+`` and tokens are kept only if they are longer than two
    characters and not in the module-level ``stop`` list.

    Parameters
    ----------
    text : object
        Raw tweet text; any value is accepted because it is passed through
        ``str()`` first (so a missing value becomes the string ``'nan'``).

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``''`` if none survive).
    """
    text = str(text).lower().replace('{html}', "")
    for pattern in (_HTML_TAG_RE, _URL_RE, _DIGITS_RE, _MENTION_RE):
        text = pattern.sub('', text)

    # Set membership instead of scanning the stop-word list once per token.
    stop_words = set(stop)

    # NOTE(review): an earlier revision also stemmed and lemmatized the tokens
    # but then returned the unnormalised words, so that work was discarded.
    # It is omitted here; the output is unchanged. Return normalised tokens
    # instead if stemming/lemmatization was actually intended.
    return " ".join(
        w for w in _WORD_TOKENIZER.tokenize(text)
        if len(w) > 2 and w not in stop_words
    )

# Store the cleaned tweets in a new column; the raw 'Text' column is kept as is.
df['cleanText'] = df['Text'].apply(preprocess)

In [None]:
# Preview the cleaned tweets on their own.
df_cleanText = df['cleanText']
df_cleanText

In [None]:
# Show the first rows again, now including the 'cleanText' column.
df.head()

# Classification

## VADER

In [None]:
# Single analyser shared by every VADER() call; created on first use so that
# the lexicon is not reloaded for each tweet.
_VADER_ANALYSER = None


def VADER(sentence):
    """Rate one text with VADER on the notebook's 2/1/0 sentiment scale.

    Parameters
    ----------
    sentence : object
        Text to score; converted with ``str()`` before analysis.

    Returns
    -------
    int
        ``2`` when the compound score is above zero (positive), ``0`` when it
        is below zero (negative) and ``1`` when it is exactly zero (neutral),
        matching the 'Sentiment (2pos/1neu/0neg)' label encoding.
    """
    global _VADER_ANALYSER
    if _VADER_ANALYSER is None:
        _VADER_ANALYSER = SentimentIntensityAnalyzer()

    compound = _VADER_ANALYSER.polarity_scores(str(sentence))['compound']

    # NOTE(review): VADER's authors suggest +/-0.05 as the neutral band; this
    # keeps the notebook's stricter zero cut-off. Confirm that is intended.
    if compound > 0:
        return 2
    if compound < 0:
        return 0
    return 1
    

# VADER is run on the raw tweet text, not on the cleaned 'cleanText' column.
df['VADER'] = df['Text'].map(VADER)

In [None]:
# Display the frame, now including the 'VADER' rating column.
df

# Comparing VADER against manual label

In [None]:
# Ground truth is the manual label; the prediction is VADER's rating for the same rows.
y_true = df['Sentiment (2pos/1neu/0neg)'].array
y_pred = df['VADER'].array

In [None]:
# Confusion matrix of manual labels (first argument) against VADER ratings.
cm = confusion_matrix(y_true, y_pred)
cm

In [None]:
# Draw the VADER confusion matrix as an annotated heatmap on a fresh figure.
fig = plt.figure()
vader_ax = sns.heatmap(cm, annot=True, fmt='d')
vader_ax.set_title("Confusion matrix of VADER vs Manual Label for Twitter")
vader_ax.set_xlabel('Predicted')
vader_ax.set_ylabel('Actual')
plt.show()

In [None]:
# Overall accuracy plus macro-averaged F1 / precision / recall for VADER.
accuracy = accuracy_score(y_true, y_pred)
f1Score, precisionScore, recallScore = (
    scorer(y_true, y_pred, average='macro')
    for scorer in (f1_score, precision_score, recall_score)
)

report = "Accuracy: {}%\nF1-score: {}\nPrecision Score: {}\nRecall Score: {}"
print(report.format(
    format(accuracy * 100, '.2f'),
    format(f1Score, '.3f'),
    format(precisionScore, '.3f'),
    format(recallScore, '.3f'),
))

# Naive Bayes Classifier

In [None]:
# Hold out 20% of the labelled tweets for testing; the seed fixes the shuffle.
features = df[['cleanText']]
labels = df['Sentiment (2pos/1neu/0neg)']
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0
)

In [None]:
# Bag-of-words counts over unigrams and bigrams; the vocabulary is learned
# from the training split only.
UNIGRAMS_AND_BIGRAMS = (1, 2)
vectorizer = CountVectorizer(ngram_range=UNIGRAMS_AND_BIGRAMS)
train_text = x_train['cleanText']
cv = vectorizer.fit_transform(train_text)

In [None]:
# Fit a multinomial Naive Bayes model on the training n-gram counts.
clf = MultinomialNB().fit(cv, y_train)

In [None]:
# Encode the held-out tweets with the vocabulary fitted on the training split,
# predict their labels, and tabulate actual vs predicted classes.
test_text = x_test['cleanText']
test_vector = vectorizer.transform(test_text)
result = clf.predict(test_vector)
cmNB = confusion_matrix(y_test, result)
cmNB

In [None]:
# Annotated heatmap of the Naive Bayes confusion matrix on a fresh figure.
fig = plt.figure()
nb_ax = sns.heatmap(cmNB, annot=True, fmt='d')
nb_ax.set(
    title="Confusion matrix of Naive Bayes Classification for Twitter",
    xlabel='Predicted',
    ylabel='Actual',
)
plt.show()

In [None]:
# Naive Bayes on count-vectorized unigrams + bigrams: accuracy and
# macro-averaged F1 / precision / recall on the held-out split.
accuracy = accuracy_score(y_test, result)
macro = {'average': 'macro'}
f1Score = f1_score(y_test, result, **macro)
precisionScore = precision_score(y_test, result, **macro)
recallScore = recall_score(y_test, result, **macro)

summary_lines = "Accuracy: {}%\nF1-score: {}\nPrecision Score: {}\nRecall Score: {}"
formatted = [format(accuracy * 100, '.2f')]
formatted += [format(value, '.3f') for value in (f1Score, precisionScore, recallScore)]
print(summary_lines.format(*formatted))

# Ensemble Classification
## Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the per-split feature sampling (max_features='sqrt'), so
# the forest and every metric derived from it are reproducible across runs.
# The seed matches the one passed to train_test_split.
# NOTE: `clf` is rebound here from the Naive Bayes model to the forest.
clf = RandomForestClassifier(
    bootstrap=False,
    max_features='sqrt',
    n_estimators=800,
    random_state=0,
)

rfCLF = clf.fit(cv, y_train)

# Encode the held-out tweets with the vocabulary fitted on the training split.
test_vector = vectorizer.transform(x_test['cleanText'])
result = clf.predict(test_vector)
cmRF = metrics.confusion_matrix(y_test, result)
cmRF

In [None]:
# Plot the Random Forest matrix (cmRF). This cell previously passed cmNB, so
# the figure titled "Random Forest" actually showed the Naive Bayes results.
fig = plt.figure()
sns.heatmap(cmRF, annot = True, fmt='d')
plt.title("Confusion matrix of Random Forest Classification for Twitter")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Random Forest on count-vectorized unigrams + bigrams: accuracy and
# macro-averaged F1 / precision / recall on the held-out split.
accuracy = accuracy_score(y_test, result)
f1Score, precisionScore, recallScore = [
    metric_fn(y_test, result, average='macro')
    for metric_fn in (f1_score, precision_score, recall_score)
]

rf_report = "Accuracy: {}%\nF1-score: {}\nPrecision Score: {}\nRecall Score: {}"
print(rf_report.format(
    format(accuracy * 100, '.2f'),
    format(f1Score, '.3f'),
    format(precisionScore, '.3f'),
    format(recallScore, '.3f'),
))

In [None]:
# Save the frame, including the columns added above, to the working directory.
# The row index is written as well because index=False is not passed.
df.to_csv('twitter_dataset.csv')