In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


import optuna

In [2]:
import nltk
# Fetch NLTK's stop-word corpus so stopwords.words('english') can be loaded
# by the text-cleaning function later in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/paranjaysoni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Column names are supplied explicitly via `names=`; with no `header=` given,
# pandas treats the first line of each file as data rather than as a header.
column_names = ['id', 'entity', 'sentiment', 'Tweet']
train_df = pd.read_csv('Dataset/twitter_training.csv', names=column_names)
test_df = pd.read_csv('Dataset/twitter_test.csv', names=column_names)

In [4]:
# Preview the first rows to check that the column names line up with the data.
train_df.head()

Unnamed: 0,id,entity,sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
# Count missing values per column in the training split.
train_df.isnull().sum()

id             0
entity         0
sentiment      0
Tweet        686
dtype: int64

In [6]:
# Count missing values per column in the test split.
test_df.isnull().sum()

id           0
entity       0
sentiment    0
Tweet        0
dtype: int64

In [7]:
# Drop every training row that has a missing value, modifying train_df in place.
# Per the null counts shown above, only the Tweet column contains missing values.
train_df.dropna(inplace=True)

In [8]:
# Class balance of the training labels (after dropping rows with missing values).
train_df['sentiment'].value_counts()

sentiment
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

In [9]:
# Class balance of the test labels.
test_df['sentiment'].value_counts()

sentiment
Neutral       285
Positive      277
Negative      266
Irrelevant    172
Name: count, dtype: int64

### Processing Tweets:

In [10]:
# Single shared Porter stemmer instance, used by stemming() below.
port_stem = PorterStemmer()

In [11]:
# Cache for the stop-word set; filled lazily on the first stemming() call.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stemming(text):
    """Clean a tweet and reduce its words to their Porter stems.

    Steps: replace every non-letter character with a space, lower-case the
    text, split on whitespace, drop English stop words, stem what is left
    with the shared ``port_stem`` instance, and join with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Space-separated stemmed tokens; an empty string if nothing survives.
    """
    # The range must be A-Z. The previous 'A-z' range also covered the ASCII
    # characters between 'Z' and 'a' ([, \, ], ^, _ and `), so those were left
    # glued to words instead of being treated as separators.
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()

    # Look the stop words up in a cached set; calling stopwords.words() inside
    # the comprehension re-evaluated it and scanned it linearly for every token.
    stop_words = _english_stopwords()
    return ' '.join(port_stem.stem(tok) for tok in tokens if tok not in stop_words)

In [12]:
# Clean both splits with the same function so they are tokenised identically.
# stemming is passed directly; wrapping it in a lambda adds nothing.
train_df['Tweet'] = train_df['Tweet'].apply(stemming)
test_df['Tweet'] = test_df['Tweet'].apply(stemming)

In [13]:
# Model inputs: the cleaned tweet text of each split.
x_train = train_df['Tweet']
x_test = test_df['Tweet']

In [14]:
# TF-IDF vectoriser constructed with its default settings.
vectorizer = TfidfVectorizer()

In [15]:
# Learn the TF-IDF vocabulary from the training tweets only, then project the
# test tweets onto that same vocabulary.
# The text is read straight from the DataFrames rather than from x_train/x_test:
# this cell used to overwrite its own inputs with sparse matrices, so running it
# a second time handed the vectoriser a matrix instead of text.
x_train = vectorizer.fit_transform(train_df['Tweet'])
x_test = vectorizer.transform(test_df['Tweet'])

### Processing Output

In [16]:
# Encoder used to turn the sentiment strings into integer class ids.
le = LabelEncoder()

In [17]:
# Learn the label-to-integer mapping from the training labels, then apply that
# same mapping to the test labels.
y_train = le.fit_transform(train_df['sentiment'])
y_test = le.transform(test_df['sentiment'])

In [18]:
# Show which integer id the encoder assigned to each sentiment label.
class_mapping = {label: code for code, label in enumerate(le.classes_)}
print("Class Mapping:", class_mapping)

Class Mapping: {'Irrelevant': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3}


# Working With Model Training:

### Using Optuna to find the best model and hyperparameters:

In [19]:
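# NOTE(review): kept for reference only. As written, this search would need
# `from sklearn.svm import LinearSVC`, and it scores every trial on x_test,
# so the hyperparameters it selects are tuned to the test set.
# def objective(trial):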
#     model_type = trial.suggest_categorical("model", ["logreg", "svm"])

#     if model_type == "logreg":
#         C = trial.suggest_loguniform("C", 1e-3, 1e2)
#         solver = trial.suggest_categorical("solver", ["lbfgs", "saga"])
#         penalty = trial.suggest_categorical("penalty", ["l2"])
#         max_iter = trial.suggest_int("max_iter", 500, 2000)

#         model = LogisticRegression(
#             C=C,
#             solver=solver,
#             penalty=penalty,
#             multi_class='multinomial',
#             max_iter=max_iter,
#             n_jobs=-1
#         )

#     elif model_type == "svm":
#         C = trial.suggest_loguniform("C", 1e-3, 1e2)
#         loss = trial.suggest_categorical("loss", ["hinge", "squared_hinge"])
#         max_iter = trial.suggest_int("max_iter", 1000, 5000)

#         model = LinearSVC(
#             C=C,
#             loss=loss,
#             max_iter=max_iter
#         )

#     model.fit(x_train, y_train)
#     y_pred = model.predict(x_test)
#     acc = accuracy_score(y_test, y_pred)
#     return acc

# # Run Optuna study
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

# print("\nBest Trial:")
# print(study.best_trial)

# print("\nBest Parameters:")
# print(study.best_params)

# print(f"\nBest Accuracy: {study.best_value:.4f}")


In [20]:
# Final classifier.
# NOTE(review): C and max_iter look like the output of the commented-out Optuna
# search above, which scores each trial on x_test. If so, the accuracy printed
# here is measured on the data used for model selection and is likely
# optimistic -- confirm, and prefer a held-out validation split for tuning.
best_model = LogisticRegression(
    C=29.075937392033783,
    solver='saga',
    penalty='l2',
    multi_class='multinomial',
    max_iter=504,
    n_jobs=-1,
    # saga shuffles the data while fitting; pin the seed so the fit and the
    # reported accuracy are repeatable across runs.
    random_state=42,
)

best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)
print("Final Accuracy: ", accuracy_score(y_test,y_pred))



Final Accuracy:  0.944


# Testing Sample:

In [21]:
# Hand-written example tweet used to try out the trained pipeline end to end.
sample_tweets = "After waiting 3 hours in line at the airport, they told me my flight was cancelled without any explanation — I am beyond frustrated and tired right now."

In [22]:
# Apply the same cleaning used on the training tweets; the result is wrapped in
# a one-element list before being passed to the vectoriser in the next cell.
sample_final = [stemming(sample_tweets)]

In [23]:
# Build the TF-IDF row directly from the raw sample so this cell can be re-run:
# it used to overwrite its own input (a list of text) with a sparse matrix, so
# a second run passed that matrix back into the vectoriser.
sample_final = vectorizer.transform([stemming(sample_tweets)])

In [24]:
# Integer class id predicted for the sample tweet; decode it with the class
# mapping printed earlier in the notebook.
best_model.predict(sample_final)[0]

1

# Exporting Model

In [25]:
import pickle

In [31]:
# Persist the fitted vectoriser, label encoder and classifier so they can be
# reloaded together for inference.
# Each file is opened in a `with` block so the handle is flushed and closed
# immediately; the bare open() calls used before were never closed.
for artifact, filename in (
    (vectorizer, 'vectorizer.pkl'),
    (le, 'le.pkl'),
    (best_model, 'best_model.pkl'),
):
    with open(filename, 'wb') as fh:
        pickle.dump(artifact, fh)