## Importing neccessary modules

In [1]:
import pandas as pd
import re
from time import time
import string
import json
import numpy as np
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MultiLabelBinarizer

from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score,confusion_matrix,classification_report
import warnings
warnings.filterwarnings('ignore')

## Task1

In [2]:
train=pd.read_csv(r"K:\Desktop\NIIT\Challenge\Course_13\DS3_C3_S2_TweetTrain_Data_Challenge.csv")
test=pd.read_csv(r"K:\Desktop\NIIT\Challenge\Course_13\DS3_C3_S2_TweetTest_Data_Challenge.csv")

train.text.fillna(train.text.mode()[0],inplace=True)
train.selected_text.fillna(train.selected_text.mode()[0],inplace=True)
train.isnull().sum()

textID              0
text                0
selected_text       0
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km�)     0
Density (P/Km�)     0
dtype: int64

In [3]:
test.isnull().sum()

textID              0
text                0
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km�)     0
Density (P/Km�)     0
dtype: int64

## Task2

In [4]:
stopwords_set = set(stopwords.words('english'))

def preprocess_df(df, column_name, stopwords_set):
    new_df = df.copy()  # create a copy of the original DataFrame
    new_df[column_name] = new_df[column_name].apply(lambda x: x.lower())  # convert text to lowercase
    new_df[column_name] = new_df[column_name].apply(lambda x: re.sub(r'[^\w\s]', '', x))  # remove punctuations
    new_df[column_name] = new_df[column_name].apply(lambda x: " ".join([word.lower() for word in re.findall(r'\w+', x) if word.lower() not in stopwords_set]))  # remove stop words
    return new_df

train_clean=preprocess_df(train,'text',stopwords_set)
test_clean=preprocess_df(test,'text',stopwords_set)

## Task3

In [5]:
vectorizer=TfidfVectorizer(stop_words=stopwords.words('english'))
trainvec=vectorizer.fit_transform(train_clean.text)
testvec=vectorizer.transform(test_clean.text)

In [6]:
model=LogisticRegression(max_iter=10)
classifier=OneVsRestClassifier(model)
classifier.fit(trainvec,train.sentiment)
ypred=classifier.predict(testvec)

## Task4

In [7]:
matrix=confusion_matrix(ypred,test.sentiment)
report=classification_report(test.sentiment, ypred)
table=pd.DataFrame({'Ypred':ypred,'Ytest':test.sentiment})

In [8]:
print('Classification report')
print(report)

print('Confusion Matrix')

print()
print(matrix)


table[:30]

Classification report
              precision    recall  f1-score   support

    negative       0.70      0.52      0.59      1001
     neutral       0.58      0.78      0.67      1430
    positive       0.80      0.64      0.71      1103

    accuracy                           0.66      3534
   macro avg       0.69      0.65      0.66      3534
weighted avg       0.68      0.66      0.66      3534

Confusion Matrix

[[ 520  180   47]
 [ 442 1110  347]
 [  39  140  709]]


Unnamed: 0,Ypred,Ytest
0,neutral,neutral
1,positive,positive
2,neutral,negative
3,positive,positive
4,negative,positive
5,positive,positive
6,neutral,negative
7,negative,negative
8,neutral,neutral
9,neutral,neutral


## Interpretation:
The classification report shows the precision, recall, and F1-score of a multi-class classification model for each class (negative, neutral, positive), as well as the overall accuracy and macro/micro average of the three metrics.

In this case, the model's overall accuracy is 0.66, which means that it correctly classified 66% of the instances in the test set. The macro-average F1-score is 0.66, which is the average of the F1-scores of the three classes. The precision, recall, and F1-score for each class show that the model performs better in classifying positive tweets, with the highest precision, recall, and F1-score. The model performs worse in classifying negative tweets, with the lowest precision, recall, and F1-score.