In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Convert the zipped TSV exports into plain CSV files in the working directory.
tsv_file_train = 'train.tsv.zip'
tsv_file_test = 'test.tsv.zip'

for tsv_path, csv_path in (
    (tsv_file_train, 'train.csv'),
    (tsv_file_test, 'test.csv'),
):
    # Tab-separated input; the row index is not written to the CSV.
    csv_table = pd.read_table(tsv_path, sep='\t')
    csv_table.to_csv(csv_path, index=False)

In [None]:
# Load the CSV files produced by the TSV -> CSV conversion cell.
# The paths are relative to the working directory, which is where that cell
# wrote 'train.csv' and 'test.csv'. The previous root-anchored paths
# ('/train.csv', '/test.csv') only resolved when the notebook happened to run
# from '/'.
data = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Quick look at the first ten rows of each frame.
print(f'Data train:\n\n{data[:10]}')
print(f'Data test:\n\n{df_test[:10]}')

In [None]:
# Preview the first rows of the training frame loaded from train.csv.
data.head()

In [None]:
# Dimensions of the training frame as (rows, columns).
data.shape

In [None]:
# Distinct values in the Sentiment column, which is used as the target `y` further down.
data['Sentiment'].unique()

In [None]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [None]:
# English stop words to drop during preprocessing.
# On a fresh environment the NLTK corpus is not installed and the lookup raises
# LookupError; fetch it once and retry so the notebook survives
# Restart Kernel -> Run All.
try:
    stopwords_set = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stopwords_set = set(stopwords.words('english'))

# ASCII emoticons: eyes (':', ';' or '='), an optional '-' nose, then a mouth
# that is one of ')', '(', 'D' or 'P'.
emoji_pattern = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')

def preprocessing(text):
    """Normalise a raw phrase into a space-separated string of tokens.

    Steps:
      1. Collect emoticons from the original (case-sensitive) text.
      2. Lower-case the text and replace every run of non-word characters
         with a single space.
      3. Append the emoticons, with any '-' nose removed, as separate tokens.
      4. Drop every token found in ``stopwords_set``.

    Args:
        text: The phrase to clean. Anything that is not a ``str`` (for
            example a missing value) is treated as empty.

    Returns:
        The cleaned phrase; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    emojis = emoji_pattern.findall(text)
    cleaned = re.sub(r'\W+', ' ', text.lower())
    emoji_tail = ' '.join(emojis).replace('-', '')
    # Keep an explicit space between the words and the emoticons; without it
    # the first emoticon is glued onto the last word (e.g. 'film:)'), which
    # also lets a trailing stop word slip past the filter below.
    tokens = f'{cleaned} {emoji_tail}'.split()
    return ' '.join(token for token in tokens if token not in stopwords_set)

In [None]:
# Clean the Phrase column of both frames with the preprocessing helper.
for frame in (data, df_test):
    frame['Phrase'] = frame['Phrase'].apply(preprocessing)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned phrases into TF-IDF features using the vectoriser's defaults.
# NOTE(review): the vectoriser is fitted on every training row before the
# hold-out split made in a later cell, so the validation rows are seen at fit
# time. Consider fitting on the training portion only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(data['Phrase'])
y = data['Sentiment']

# Transform (do not re-fit) the test phrases with the already fitted vectoriser.
test = tfidf.transform(df_test['Phrase'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out half of the rows for validation. shuffle=False keeps the original
# row order, so the split is positional rather than random.
# NOTE(review): random_state presumably has no effect while shuffle=False —
# confirm against the scikit-learn docs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=False,
    random_state=1,
)

In [None]:
from sklearn.model_selection import train_test_split
# NOTE(review): this cell repeats the split from the previous cell with the
# same arguments; one of the two cells can be removed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=False,
    random_state=1,
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the training half of the TF-IDF matrix.
# `multi_class` is deliberately not passed: the argument is deprecated since
# scikit-learn 1.5 (FutureWarning, scheduled for removal), and with the lbfgs
# solver the default already fits a multinomial model when the target has
# three or more classes.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)

# Overall share of correctly classified rows, printed to two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class breakdown of the same predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Predict labels for the test phrases; `test` is the TF-IDF matrix built from df_test.Phrase.
pred = model.predict(test)

In [None]:
# Preview the test frame; its Phrase column was overwritten with the preprocessed text earlier.
df_test.head()

In [None]:
import pickle
# Persist the fitted classifier to disk.
# NOTE(review): only the model is saved. The fitted `tfidf` vectoriser is not,
# so this file alone cannot score raw text in a fresh session.
with open('model_pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Round-trip check: reload the pickled classifier and predict on the test features.
# Only unpickle files produced by a trusted source, because pickle.load can
# execute arbitrary code.
with open('model_pickle', 'rb') as model_file:
    mp = pickle.load(model_file)
mp.predict(test)