In [1]:
# Load the labelled sentiment dataset and preview its first rows.
# NOTE: the preview contains an "Unnamed: 0" column, i.e. a row index that was
# written into the CSV; the cells shown below only use `text` and `label`.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='sentiment_data.csv')
df.head(n=5)


Unnamed: 0,text,label
0,almost got in a giant car accident on the 101,0
1,like something wholly original,1
2,b.s. one another,0
3,"Happy Star Wars Day, may the 4th be with you ...",1
4,few new converts,0


In [2]:
# Cleaning the text (Data Preprocessing)
import re
# Patterns are compiled once so they are not re-parsed for every row.
_URL_RE = re.compile(r"http\S+|www\S+|https\S+")
_MENTION_RE = re.compile(r"@\w+")
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize a raw text sample for vectorization.

    Steps, in order: lowercase, strip URLs, strip @mentions, drop every
    character that is not an ASCII letter or whitespace (punctuation, digits,
    non-ASCII symbols), then collapse runs of whitespace and trim the ends.

    Parameters
    ----------
    text : str or any
        The raw sample. Non-string values (e.g. ``None`` or the float ``NaN``
        pandas uses for missing CSV cells) are treated as empty text instead
        of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing usable remains.
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash the whole column-wise apply.
        return ''

    text = text.lower()
    text = _URL_RE.sub('', text)         # remove URLs
    text = _MENTION_RE.sub('', text)     # remove mentions
    text = _NON_LETTER_RE.sub('', text)  # remove punctuation/numbers
    # Removals above can leave doubled or leading/trailing spaces behind.
    return _WHITESPACE_RE.sub(' ', text).strip()

# Run the cleaner over every row of `text` and keep the result in a new column.
df['cleaned_text'] = df['text'].map(clean_text)

In [3]:
# Preview the frame again to compare `text` with the new `cleaned_text` column.
df.head()

Unnamed: 0,text,label,cleaned_text
0,almost got in a giant car accident on the 101,0,almost got in a giant car accident on the
1,like something wholly original,1,like something wholly original
2,b.s. one another,0,bs one another
3,"Happy Star Wars Day, may the 4th be with you ...",1,happy star wars day may the th be with you my ...
4,few new converts,0,few new converts


In [6]:
# Separate the independent feature (X: cleaned text) from the dependent target (y: label)
X, y = df['cleaned_text'], df['label']

In [4]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 10% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=32,
)
# Show the number of samples in each split.
(X_train.shape, X_test.shape)

((66555,), (7396,))

### But we have a problem here
- ML models can only work with numbers.
- Our independent feature (X), however, is text made up of letters.
- So we need to convert that text into a numerical representation.

### For that purpose, we can use TF-IDF (Term Frequency - Inverse Document Frequency)
- It's a way to find out **how important a word is in a document compared to a collection of documents.**
- **TF (Term Frequency)**: How often a word appears in a document. More frequent words in a document get higher TF.
- **IDF (Inverse Document Frequency)**: How unique or rare a word is across all documents. Common words like "the" or "is" get lower IDF, rare words get higher IDF.

**TF-IDF = TF × IDF**

In [9]:
# Create the TF-IDF vectorizer that turns text into numeric feature vectors.
from sklearn.feature_extraction.text import TfidfVectorizer
# Built with default settings; nothing is learned until it is fitted on data.
tfidf = TfidfVectorizer()

In [10]:
# Fit the vectorizer on the training split only, then reuse that fitted
# vectorizer for the test split so no test-set information shapes the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
# Display (number of training documents, number of features).
X_train_tfidf.shape

(66555, 39533)

In [12]:
# Train a logistic-regression classifier on the TF-IDF training features.
from sklearn.linear_model import LogisticRegression
# Default hyperparameters are used (see the parameter table in the cell output).
model = LogisticRegression()
# fit() is the last expression, so the fitted estimator is displayed below the cell.
model.fit(X_train_tfidf, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [13]:
# Predict a label for every held-out test sample.
y_pred = model.predict(X_test_tfidf)
# Display the predicted label array.
y_pred

array([1, 1, 0, ..., 1, 0, 1], shape=(7396,))

In [14]:
# Score the test-set predictions against the true labels.
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, y_pred)
clf = classification_report(y_test, y_pred)  # text report, not a classifier

# Print each result under its own heading.
for heading, result in (
    ("Confusion Matrix:\n", cm),
    ("Classification Report:\n", clf),
):
    print(heading, result)

Confusion Matrix:
 [[2861  455]
 [ 434 3646]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.86      0.87      3316
           1       0.89      0.89      0.89      4080

    accuracy                           0.88      7396
   macro avg       0.88      0.88      0.88      7396
weighted avg       0.88      0.88      0.88      7396



In [21]:
# Hand-written sentences for a quick sanity check of the trained model.
my_testing = [
    'I love harry potter',
    'the professer is teaching but bad',
]


In [19]:
# Clean each sentence with the same clean_text() used on the training data
# before vectorizing, so prediction-time features are built the same way as
# the features the model was trained on (URLs, mentions and digits removed).
model.predict(tfidf.transform([clean_text(sentence) for sentence in my_testing]))

array([1, 1])

_**We shall export the model and make a simple User Interface using Streamlit** 🎉_

Main motive of the ROC curve

In [24]:
# Lets learn about the ROC Curve 
from sklearn.metrics import roc_curve



In [26]:
# Persist both fitted objects: predictions need the text transformed by the
# same fitted vectorizer before it is passed to the model, so the two files
# must be loaded and used together.
import joblib
joblib.dump(model, "sentiment_model.pkl")
# Last expression: the list of written file names is shown as the cell output.
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']