In [None]:
import pandas as pd
import re

In [None]:
# Location of the training CSV, relative to the notebook's working directory.
data_path = "train.csv"

In [None]:
# Read the training CSV (decoded as ISO-8859-1) and preview its first five rows.
data = pd.read_csv(data_path, encoding="ISO-8859-1")
data.head()

In [None]:
# Number of rows that were loaded.
data.shape[0]

## Data Preprocessing

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Shared Porter stemmer instance; Preprocessing() calls ps.stem() on every token.
ps = PorterStemmer()

In [None]:
def Preprocessing(text):
    """Clean one raw tweet and return its stemmed tokens as a single string.

    Steps, in order: drop @handles and URLs, trim surrounding quotes/spaces,
    collapse runs of four or more identical characters to one, replace dot
    runs and every other non-letter with a space, lowercase, split on
    whitespace and stem each token with the module-level stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw text. Missing values (None / NaN) are treated as empty text;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        Lowercase stemmed tokens joined by single spaces ('' if none remain).
    """
    # pandas represents missing values as NaN/None, which have no string methods
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # removing twitter handles @user
    # (raw string: a backslash-w inside a plain literal is an invalid escape sequence)
    text = re.sub(r"@[\w]*", " ", text)

    # removing URLs (www.* or http(s)://*) by replacing them with a space
    text = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' ', text)

    # stripping space, " and ' from both ends of the text
    text = text.strip(' "\'')

    # collapsing runs of 4+ identical characters to a single one, e.g. yessss -> yes
    text = re.sub(r'(.)\1{3,}', r'\1', text)

    # replacing two or more dots with space; this has to run before the
    # non-letter replacement below, which would otherwise have consumed every dot
    text = re.sub(r"\.{2,}", " ", text)

    # replacing every character that is not an ASCII letter with a space
    text = re.sub('[^A-Za-z]', ' ', text)

    # converting all text into small letters and splitting it into words
    text_list = text.lower().split()

    # stemming each word with the Porter stemmer and re-joining with single spaces
    return ' '.join(ps.stem(word) for word in text_list)

In [None]:
# Clean every SentimentText entry and store the result in a new column beside the raw text.
data['Preprocessed_data'] = data['SentimentText'].map(Preprocessing)

In [None]:
# Preview the first five rows, now including the Preprocessed_data column.
data.head(5)

# Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed seed keeps the split, and
# every score computed from it, identical on each Restart & Run All.
train, test = train_test_split(data, test_size=0.20, random_state=42)

In [None]:
# Row counts of the training and test splits.
train.shape[0], test.shape[0]

# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Term-count vectorizer and TF-IDF transformer, both created with default settings.
# They are fitted on the training split only and then reused to transform the test split.
count_vectorization = CountVectorizer()
tfidf_transformation = TfidfTransformer()

In [None]:
# Target labels for each split.
y_train = train['Sentiment']
y_test = test['Sentiment']

# Training text: fit the count vectorizer and the TF-IDF transformer, then encode.
X_train_count_vector = count_vectorization.fit_transform(train["Preprocessed_data"])
X_train_tfidf_vector = tfidf_transformation.fit_transform(X_train_count_vector)

# Test text: encode only, reusing what was fitted on the training split.
X_test_count_vector = count_vectorization.transform(test["Preprocessed_data"])
X_test_tfidf_vector = tfidf_transformation.transform(X_test_count_vector)

In [None]:
# Print the cleaned text of the training split for a quick sanity check.
print(train.loc[:, "Preprocessed_data"])

# Training and Testing Models

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [None]:
# Maps a model's display name to its predicted labels for the test split.
model_prediction = {}

In [None]:
# Baseline classifiers, keyed by the name under which their predictions are stored.
baseline_models = {
    # Linear model trained with Stochastic Gradient Descent. loss="modified_huber"
    # is a smoothed hinge loss; the seed makes the shuffled SGD passes repeatable.
    'SVM-SGD': SGDClassifier(max_iter=1000, tol=1e-3, loss="modified_huber", random_state=42),
    # Multinomial Naive Bayes
    'Multinomial': MultinomialNB(),
    # Bernoulli Naive Bayes
    'Bernoulli': BernoulliNB(),
    # Logistic Regression
    'Logistic': LogisticRegression(C=1),
}

# Fit each model on the TF-IDF training matrix and keep its test-set predictions.
# The loop variable is named `model` so that, as before, `model` ends up bound to
# the last fitted estimator (Logistic Regression).
for model_name, model in baseline_models.items():
    model.fit(X_train_tfidf_vector, y_train)
    model_prediction[model_name] = model.predict(X_test_tfidf_vector)

In [None]:
# # SVM - Support Vector Classifier
# model = SVC(gamma='auto', C=1).fit(X_train_tfidf_vector, y_train)
# model_prediction['SVM'] = model.predict(X_test_tfidf_vector)

In [None]:
# Decision Trees
# Seeded so that fitting the tree is deterministic across runs.
model = DecisionTreeClassifier(random_state=42).fit(X_train_tfidf_vector, y_train)
model_prediction['Decision Tree'] = model.predict(X_test_tfidf_vector)


# Voting LR-SGD
# Soft voting combines the predicted class probabilities of both estimators, so
# each one must expose predict_proba; SGDClassifier does with loss="modified_huber".
clf1 = LogisticRegression(C=1)
clf2 = SGDClassifier(max_iter=1000, tol=1e-3, loss="modified_huber", random_state=42)
model = VotingClassifier(estimators=[('LR', clf1),('SGD', clf2)],voting='soft').fit(X_train_tfidf_vector, y_train)
model_prediction['Votting-LR-SGD'] = model.predict(X_test_tfidf_vector)

In [None]:
# Display the predicted test-set labels stored for each model name.
model_prediction

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Accuracy of every stored model on the test split, as a percentage with two decimals.
print("ACCURACY SCORE:\n")
for model_name, predictions in model_prediction.items():
    accuracy_pct = round(accuracy_score(y_test, predictions) * 100, 2)
    print(model_name, ': ', accuracy_pct)

In [None]:
# F1 score of every stored model on the test split, rounded to two decimals.
print("F1 SCORE:\n")
for model_name, predictions in model_prediction.items():
    f1_value = round(f1_score(y_test, predictions), 2)
    print(model_name, ': ', f1_value)

In [None]:
from sklearn.metrics import precision_recall_fscore_support
# Macro-averaged precision, recall and F-score (as percentages) for every stored model.
for model_name, predictions in model_prediction.items():
    macro_scores = precision_recall_fscore_support(y_test, predictions, average='macro')
    precision, recall, fscore = macro_scores[:3]

    # header row: model name padded to a fixed width, then the column titles
    print(f"\n{model_name}|{' '*(17- len(model_name))}   precision         recall          Fscore")
    # value row: the three numbers, spaced to sit under the column titles
    print('                      ', round(precision*100,2), end='             ')
    print(round(recall*100,2), end='           ')
    print(round(fscore*100,2))

# Confusion matrix

In [None]:
# Confusion Matrix for Logistic Regression
# matplotlib was never imported in this notebook, so plt.title / plt.show raised
# NameError on a fresh kernel; import it here next to seaborn.
import matplotlib.pyplot as plt
import seaborn as sns

conf_mat = confusion_matrix(y_test, model_prediction['Logistic'])
# NOTE(review): assumes Sentiment is coded 0 = negative, 1 = positive — confirm against train.csv
axes = ["Negative", "Positive"]
confusion_matrix_df = pd.DataFrame(conf_mat, index=axes, columns=axes, dtype=int)
sns.heatmap(confusion_matrix_df, annot=True, fmt="d")
# confusion_matrix puts true labels on rows and predicted labels on columns
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix for Logistic Regression")
plt.show()