In [1]:
import numpy as np
import pandas as pd
import timeit
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

# Tweet Sentiment Prediction
Air Paradis has commissioned an AI product that anticipates bad buzz on social networks. The AI product can predict the sentiment associated with a tweet.
* Data description: information about tweets (user who posted, content, time of posting) and a binary label (tweet expressing a negative sentiment or not). 
* A functional prototype of the model. That sends a tweet and retrieves the sentiment prediction. 

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kazanova/sentiment140")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/kazanova/sentiment140?dataset_version_number=2...


100%|██████████| 80.9M/80.9M [00:03<00:00, 25.9MB/s]

Extracting model files...





Path to dataset files: C:\Users\spectre\.cache\kagglehub\datasets\kazanova\sentiment140\versions\2


In [2]:
# 1. Get the file path and read the file
file_path = 'C:/Users/spectre/.cache/kagglehub/datasets/kazanova/sentiment140/versions/2/training.1600000.processed.noemoticon.csv'
# read the datafile
df = pd.read_csv(file_path, 
                sep= ',',
                on_bad_lines='warn',
                parse_dates=True,
                encoding='latin-1',
                header=None,
                engine='python')
# add column titles
df.columns = ['polarity', 'id', 'date', 'query', 'user', 'text']
# select onlt the needed columns
df = df[['polarity', 'text']]

In [3]:
# 2. Covert polarity from 0,4 to 0,1 binary values
df['target'] = df['polarity'].apply(lambda x: 1 if x == 4 else 0)
# count the number of occurance for each sentiment class
print(f'polarity counts: {df.polarity.value_counts()}\nTarget counts: {df.target.value_counts()}')

polarity counts: polarity
0    800000
4    800000
Name: count, dtype: int64
Target counts: target
0    800000
1    800000
Name: count, dtype: int64


# Preprocess the text
Convert text to lower case, remove contractions, urls, usernames, digits, punctuations stopwords, tokenize corpus and lemmatize.

In [None]:
# 3. Preprocess the text 
# transform corpus to lowercase
df['text'] = df['text'].str.lower()

# Change contractions so that tokenization can work right
import contractions
df['text'] = df['text'].apply(lambda x: ' '.join([contractions.fix(word) for word in x.split()]))

# Remove URLs
df['text'] = df['text'].str.replace(r'http\S+|www.\S+', '', case=False)

# Remove all @s (usernames)
df['text'] = df['text'].replace(r'@\S+', '')

# Remove digits
df['text'] = df['text'].replace(r'\d+', '', regex=True)
    
# Remove Punctuation
df['text'] = df['text'].str.replace('[^\w\s]', '')

In [6]:
# 4. Lemmatize the text
# Lemmatize the corpus
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# function to get a language detector
def get_lang_detector(nlp, name):
    return LanguageDetector()

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

Language.factory("language_detector", func=get_lang_detector) # load the language detector
nlp.add_pipe('language_detector', last=True)

def lemmatize(text):

   doc = nlp(text)

   tokens = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct or token.is_digit)]

   return ' '.join(tokens)

In [None]:
# Apply the  lemmatize()  function to the whole cleaned corpus with:
df['clean_text'] = df.text.apply(lambda txt : lemmatize(txt))

In [None]:
# 5. Split data into label, features, traing and testing sets
from sklearn.model_selection import train_test_split
X = df.clean_text
y = df.target
# Split X and y into training and testing sets
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=42)

In [14]:
print(f'Training text counts: {len(Xtrain)}\nTraining Target counts: {ytrain.value_counts()}\n--------------------------------------\nTesting text counts: {len(Xtest)}\nTesting Target counts: {ytest.value_counts()}')

Training text counts: 1200000
Training Target counts: target
0    600419
1    599581
Name: count, dtype: int64
--------------------------------------
Testing text counts: 400000
Testing Target counts: target
1    200419
0    199581
Name: count, dtype: int64


In [None]:
# 6. Convert the text to a bag-of-words representation
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
Xtrain = vectorizer.fit_transform(Xtrain)
Xtest = vectorizer.transform(Xtest)

In [None]:
# 7. Train a logistic regression classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(Xtrain, ytrain)

In [None]:
# Function to predict and evaluate model
def val_test(model, x_train, y_train, x_test, y_test):
    from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
    from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, classification_report, confusion_matrix
    # generate predictions
    y_pred = model.predict(x_test)
    # predicted probabilities
    y_pred_prob = model.predict_proba(x_test)[::, 1]

    # Generate a confusion matrix for the model
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(
        cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
        )

    # cross validation
    c_v = RepeatedStratifiedKFold(n_splits=10, n_repeats= 3, random_state=13)
    #crossvalidation score
    crosval = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=c_v, n_jobs=-1)

    # Print the classification report for the model
    print("Confusion Matrix")
    display(cm_df)

    print("Classification Report")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy Score : {accuracy_score(y_test, y_pred): .3f}, \
        Recall score: {recall_score(y_test, y_pred): .3f}")
    print(f'Area Under Curve: {roc_auc_score(y_test, model.predict_proba(x_test)[::,1]): .4f}, \
        Cross Validation Score: {crosval.mean():.3f}')

In [None]:
# 8. Make predictions and evaluate model
val_test(classifier, Xtrain, ytrain, Xtest, ytest)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,154644,44937
Actual 1,36536,163883


Classification Report
              precision    recall  f1-score   support

           0       0.81      0.77      0.79    199581
           1       0.78      0.82      0.80    200419

    accuracy                           0.80    400000
   macro avg       0.80      0.80      0.80    400000
weighted avg       0.80      0.80      0.80    400000

Accuracy Score :  0.796,         Recall score:  0.818


NameError: name 'X_test' is not defined

In [None]:
# 9. Apply model to make prediction
import string

# Function to clean text
def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

# New data sample
new_data = ["This place was lit."]
new_data_clean = [clean_text(doc) for doc in new_data]  # Cleaning new data

# Making predictions
print("Prediction:", classifier.predict(new_data_clean))

In [None]:
# 7. Supply Tweet and apply model to classify
def classify_tweet(model, vectorizer, tweet):
    message_vect = vectorizer.transform([tweet])
    prediction = model.predict(message_vect)
    return &quot;negative&quot; if prediction[0] == 0 else &quot;positive&quot;

# Example of using the function
message = &quot;This sob thinks they can bulldoze around!&quot;
print(classify_message(classifier, vectorizer, message))

Unnamed: 0,polarity,text
0,0,"[switchfoot, http, twitpic, com, yzl, awww, bu..."
1,0,"[upset, cannot, update, facebook, texting, it,..."
2,0,"[kenichan, dived, many, times, ball, managed, ..."
3,0,"[whole, body, feels, itchy, like, fire]"
4,0,"[nationwideclass, no, behaving, all, mad, here..."
...,...,...
1599995,4,"[woke, up, school, best, feeling, ever]"
1599996,4,"[thewdb, com, cool, hear, old, walt, interview..."
1599997,4,"[ready, mojo, makeover, ask, details]"
1599998,4,"[happy, th, birthday, boo, alll, time, tupac, ..."
