# Library imports

In [1]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore", category = FutureWarning)
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = ConvergenceWarning)

NameError: name 'ConvergenceWarning' is not defined

# Data read

In [None]:
data = pd.read_csv("data/sentiment.tsv", sep = "\t")
labelColumns = ["label", "tweet"]
data.columns = labelColumns
data.head()

# Data preprocessing

In [None]:
le = LabelEncoder()
labelLabelEncoded = "label_label_encoded"
data[labelLabelEncoded] = le.fit_transform(data["label"])
data = data[[labelColumns[0]] + [labelLabelEncoded] + labelColumns[1:]]
data.head()

# Cleaning of Data

In [None]:
def remove_pattern(text, pattern):
    r = re.findall(pattern, text)
    for i in r:
        text = re.sub(i, '', text)
    return text

## Removing of twitter handles (@user)

In [None]:
data["clean_tweet"] = np.vectorize(remove_pattern)(data["tweet"], '^\w{1}$')
data["clean_tweet"] = np.vectorize(remove_pattern)(data["clean_tweet"], '@[\w]*')
data.head()

## Removing html tags

In [None]:
data["clean_tweet"] = np.vectorize(remove_pattern)(data["clean_tweet"], '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
data.head()

## Removing special characters, numbers, punctiations

In [None]:
data["clean_tweet"] = data["clean_tweet"].str.replace("[^a-zA-Z#]", " ")
data.head()

## Tokenization of the tweets

In [None]:
tokenized_tweet = data["clean_tweet"].apply(lambda x: x.split())
tokenized_tweet.head()

In [None]:
stemmer = PorterStemmer()
# lemmatizer = WordNetLemmatizer()
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])
# or
# tokenized_tweet = tokenized_tweet.apply(lambda x: [lemmatizer.lemmatize(i) for i in x])
tokenized_tweet.head()

In [None]:
tokenized_tweet = tokenized_tweet.apply(lambda x: ' '.join(x))
tokenized_tweet.head()

In [None]:
data["tokenized_tweet"] = tokenized_tweet
data.head()

## Adding other columns
* length of the tweet
* punctiation

In [None]:
def count_punctuation(text):
    count = sum([1 for char in text if char in string.punctuation])

    return round(count / (len(text) - text.count(" ")), 3) * 100

def count_uppercase(text):
    r = re.findall("[A-Z]", text)

    return round(len(r) / (len(text) - text.count(" ")), 3) * 100

data["length"] = data["tweet"].apply(lambda x: len(x) - x.count(" "))
data["uppercase%"] = data["tweet"].apply(lambda x: count_uppercase(x))
data["punctuation%"] = data["tweet"].apply(lambda x: count_punctuation(x))

# Generate the word cloud

In [None]:
all_words = ' '.join(data['tokenized_tweet'])
negative_words = ' '.join(data['tokenized_tweet'][data['label_label_encoded']==0])
positive_words = ' '.join(data['tokenized_tweet'][data['label_label_encoded']==1])
wordcloud = WordCloud(height=800, width=800, random_state=0).generate(all_words)
plt.imshow(wordcloud, interpolation='bilinear')
# plt.figure(figsize=(10, 10))
plt.axis('off')
plt.show()

# Engineering

## The CountVectorizer method

In [None]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorised_tweet = count_vectorizer.fit_transform(data['tokenized_tweet'])
X_count_feat = pd.concat([data['length'], data['punctuation%'], data['uppercase%'], pd.DataFrame(count_vectorised_tweet.toarray())], axis = 1)
X_count_feat.head()

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')
tfidf_vectorized_tweet = tfidf_vectorizer.fit_transform(data['tokenized_tweet'])
X_tfidf_feat = pd.concat([data['length'], data['punctuation%'], data['uppercase%'], pd.DataFrame(tfidf_vectorized_tweet.toarray())], axis = 1)
X_tfidf_feat.head()

# Data scientism

In [None]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVC', SVC()))
max_score = 0
for name, classifier in models:
    score = cross_val_score(classifier, X_count_feat, data['label_label_encoded'], scoring='accuracy', cv=10).mean()
    print('The model %s has the accuracy of %f'%(name, score))
    if score > max_score:
        max_score = score
print('The model %s has the best accuracy with a score of %f'%(name, max_score))