# IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# LOAD DATASET

In [13]:
df = pd.read_csv("twitter_sentiment.csv", encoding="latin1")
df.head()


Unnamed: 0,Tweet ID,Topic,Sentiment (Psitive/Negative/Neutral),Tweet Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


# RENAME COLUMNS

In [15]:
df = df.iloc[:, :4]
df.columns = ["tweet_id", "topic", "sentiment", "tweet"]
df.head()


Unnamed: 0,tweet_id,topic,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


# CONVERTING SENTIMENTS INTO NUMBERS

In [17]:
df["sentiment"] = df["sentiment"].map({
    "Negative": 0,
    "Neutral": 1,
    "Positive": 2
})


In [19]:
df["sentiment"].value_counts()


0.0    22542
2.0    20832
1.0    18318
Name: sentiment, dtype: int64

# REMOVING MISSING VALUES

In [21]:
df.isnull().sum()


tweet_id         0
topic            0
sentiment    12990
tweet          686
dtype: int64

In [23]:
df = df.dropna(subset=["tweet", "sentiment"])


# TEXT CLEANING

In [25]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+|#", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    return text

df["clean_tweet"] = df["tweet"].apply(clean_text)


In [27]:
df[["tweet", "clean_tweet"]].head()


Unnamed: 0,tweet,clean_tweet
0,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...
1,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
2,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all
3,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...,im getting on borderlands and i will murder y...


# DATA SPLIT (TRAIN & TEST)

In [29]:
X = df["clean_tweet"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# TF-IDF (TEXT --> NUMBERS)

In [31]:
tfidf = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


# MODEL TRAINING

In [33]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


# MODEL TESTING

In [35]:
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7559100204498977
              precision    recall  f1-score   support

         0.0       0.78      0.81      0.79      4427
         1.0       0.73      0.68      0.70      3678
         2.0       0.75      0.77      0.76      4120

    accuracy                           0.76     12225
   macro avg       0.75      0.75      0.75     12225
weighted avg       0.76      0.76      0.76     12225



In [37]:
df["predicted_sentiment"] = model.predict(
    tfidf.transform(df["clean_tweet"])
)

df.to_csv("twitter_sentiment_predictions.csv", index=False)
print("File saved successfully")


File saved successfully
