In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
df = pd.read_csv('../judge-1377884607_tweet_product_company.csv', encoding='ISO-8859-1')


In [3]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [5]:
df.shape

(9093, 3)

In [6]:
def clean_text(text):
    # Check if the text is a string
    if not isinstance(text, str):
        return ''  # Return empty string if text is not a string
    # Remove URLs items
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    text = re.sub(r'\@\w+|\#','', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z']", ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english')) | {'sxsw','link','rt'}
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

# Clean the tweet_text column
df['cleaned_tweet_text'] = df['tweet_text'].apply(clean_text)

In [7]:
df['cleaned_tweet_text'].head(30)

0     g iphone hrs tweeting rise austin dead need up...
1     know awesome ipad iphone app likely appreciate...
2                                   wait ipad also sale
3         hope year's festival crashy year's iphone app
4     great stuff fri marissa mayer google tim o'rei...
5     new ipad apps speechtherapy communication show...
6                                                      
7     starting ctia around corner googleio hop skip ...
8     beautifully smart simple idea wrote hollergram...
9     counting days plus strong canadian dollar mean...
10    excited meet show sprint galaxy still running ...
11    find amp start impromptu parties can't wait ti...
12    foursquare ups game time still prefer far best...
13    gotta love google calendar featuring top parti...
14                                       great ipad app
15               haha awesomely rad ipad app hollergram
16                holler gram ipad itunes app store via
17    noticed dst coming weekend many iphone use

In [14]:
# Map textual sentiment labels to numerical values
sentiment_mapping = {
    "Positive emotion": 1,
    "Negative emotion": -1,
    "No emotion": 0,
    "I can't tell": None  # You might choose to exclude these or handle them differently
}

# Apply the mapping to your sentiment label column
df['sentiment_label'] = df['is_there_an_emotion_directed_at_a_brand_or_product'].map(sentiment_mapping)

# Drop rows where 'sentiment_label' is NaN
df = df.dropna(subset=['sentiment_label'])

# Check the first few rows to ensure the mapping is applied correctly
print(df[['is_there_an_emotion_directed_at_a_brand_or_product', 'sentiment_label']].head())

  is_there_an_emotion_directed_at_a_brand_or_product  sentiment_label
0                                   Negative emotion             -1.0
1                                   Positive emotion              1.0
2                                   Positive emotion              1.0
3                                   Negative emotion             -1.0
4                                   Positive emotion              1.0


In [15]:
# Assuming 'clean_text' is your preprocessing function and 'tweet_text' is the column to preprocess
df['cleaned_text'] = df['tweet_text'].apply(clean_text)

# Drop rows with NaN values in 'sentiment_label' after mapping but before vectorization
df = df.dropna(subset=['sentiment_label'])

# Now vectorize your cleaned text

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(df['cleaned_text'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df['tweet_text'].apply(clean_text)


In [16]:
df['cleaned_tweet_text'].head(10)

0     g iphone hrs tweeting rise austin dead need up...
1     know awesome ipad iphone app likely appreciate...
2                                   wait ipad also sale
3         hope year's festival crashy year's iphone app
4     great stuff fri marissa mayer google tim o'rei...
7     starting ctia around corner googleio hop skip ...
8     beautifully smart simple idea wrote hollergram...
9     counting days plus strong canadian dollar mean...
10    excited meet show sprint galaxy still running ...
11    find amp start impromptu parties can't wait ti...
Name: cleaned_tweet_text, dtype: object

In [17]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


lemmatizer = WordNetLemmatizer()

 
def lemmatize_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Rejoin lemmatized tokens into a string
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

# Example text
example_text = "Fall Out Boy Rules."

# Lemmatize the example text
# Apply lemmatization to the DataFrame column
df['lemmatized_text'] = df['cleaned_tweet_text'].apply(lemmatize_text)

# Print the first few rows of the 'lemmatized_text' column to verify the output
print(df['lemmatized_text'].head())


0    g iphone hr tweeting rise austin dead need upg...
1    know awesome ipad iphone app likely appreciate...
2                                  wait ipad also sale
3      hope year 's festival crashy year 's iphone app
4    great stuff fri marissa mayer google tim o'rei...
Name: lemmatized_text, dtype: object


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
count_vect = CountVectorizer()

# Fit and transform the 'lemmatized_text' column
X_counts = count_vect.fit_transform(df['lemmatized_text'])

# Convert the result to a DataFrame to view the token counts
count_vect_df = pd.DataFrame(X_counts.toarray(), columns=count_vect.get_feature_names_out())
print(count_vect_df.head())


   aapl  abacus  abandoned  aber  ability  able  abroad  absolute  absolutely  \
0     0       0          0     0        0     0       0         0           0   
1     0       0          0     0        0     0       0         0           0   
2     0       0          0     0        0     0       0         0           0   
3     0       0          0     0        0     0       0         0           0   
4     0       0          0     0        0     0       0         0           0   

   abt  ...  zimride  zing  zip  zite  zms  zombie  zomg  zone  zoom  zzzs  
0    0  ...        0     0    0     0    0       0     0     0     0     0  
1    0  ...        0     0    0     0    0       0     0     0     0     0  
2    0  ...        0     0    0     0    0       0     0     0     0     0  
3    0  ...        0     0    0     0    0       0     0     0     0     0  
4    0  ...        0     0    0     0    0       0     0     0     0     0  

[5 rows x 5205 columns]


In [19]:
# Sum up the counts for each word across all documents
word_counts = count_vect_df.sum(axis=0)

# Sort the word counts in descending order to get the most frequent words
top_15_words = word_counts.sort_values(ascending=False).head(15)

# Display the top 25 words and their counts
print(top_15_words)


ipad       1460
apple      1059
google      887
iphone      717
quot        639
store       609
app         460
new         403
austin      325
amp         233
android     233
pop         231
get         206
launch      196
one         174
dtype: int64


In [20]:

X_train, X_test, y_train, y_test = train_test_split(
    X_counts, 
    df['sentiment_label'], 
    test_size=0.2,  # 80% training and 20% testing
    random_state=42  # Ensures a reproducible split
)


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

#initialize model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8633802816901408
              precision    recall  f1-score   support

        -1.0       0.66      0.33      0.44       115
         1.0       0.88      0.97      0.92       595

    accuracy                           0.86       710
   macro avg       0.77      0.65      0.68       710
weighted avg       0.85      0.86      0.84       710



In [22]:
from imblearn.over_sampling import RandomOverSampler

# Initialize the RandomOverSampler
ros = RandomOverSampler(random_state=42)

# Resample the dataset
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Now X_resampled and y_resampled have balanced classes


In [23]:


# Initialize the model with class weights
model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Train the model
model.fit(X_train, y_train)

# Continue with prediction and evaluation...


In [24]:
# Predict on the test set
y_pred = model.predict(X_test)


In [25]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Generate a classification report
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

# Generate and display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.8507042253521127
              precision    recall  f1-score   support

    Negative       0.53      0.60      0.57       115
    Positive       0.92      0.90      0.91       595

    accuracy                           0.85       710
   macro avg       0.73      0.75      0.74       710
weighted avg       0.86      0.85      0.85       710

Confusion Matrix:
 [[ 69  46]
 [ 60 535]]


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vect = TfidfVectorizer()

# Apply TF-IDF to the 'cleaned_text' column, then split into training and testing sets
X_tfidf = tfidf_vect.fit_transform(df['cleaned_text'])

# Splitting the dataset into training and testing sets
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, 
    df['sentiment_label'], 
    test_size=0.2, 
    random_state=42
)


In [27]:
# Initialize the model with class weights to handle imbalance
model_tfidf = LogisticRegression(max_iter=1000, class_weight='balanced')

# Train the model using the TF-IDF features
model_tfidf.fit(X_train_tfidf, y_train)

# Predict on the testing set
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)


In [28]:
# Calculate accuracy and other metrics for the new model
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f"Accuracy with TF-IDF: {accuracy_tfidf}")

# Generate a classification report
print(classification_report(y_test, y_pred_tfidf, target_names=['Negative', 'Positive']))

# Generate and display confusion matrix
conf_matrix_tfidf = confusion_matrix(y_test, y_pred_tfidf)
print("Confusion Matrix with TF-IDF:\n", conf_matrix_tfidf)


Accuracy with TF-IDF: 0.847887323943662
              precision    recall  f1-score   support

    Negative       0.53      0.62      0.57       115
    Positive       0.92      0.89      0.91       595

    accuracy                           0.85       710
   macro avg       0.72      0.75      0.74       710
weighted avg       0.86      0.85      0.85       710

Confusion Matrix with TF-IDF:
 [[ 71  44]
 [ 64 531]]


In [29]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Norm used in the penalization
    'solver': ['liblinear', 'saga']  # Algorithms to use in the optimization problem
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(
    LogisticRegression(max_iter=10000, class_weight='balanced'),
    param_grid,
    scoring='accuracy',  # You can choose other scoring metrics if accuracy is not your sole focus
    cv=5,  # Number of folds in cross-validation
    verbose=1  # Higher number gives more verbose output
)

# Fit the grid search to the data
grid_search.fit(X_train_tfidf, y_train)

# Print the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits




Best Parameters: {'C': 100, 'penalty': 'l1', 'solver': 'saga'}
Best Score: 0.8643378791266116


In [30]:
best_model = LogisticRegression(
    max_iter=10000,  # Consider increasing this number
    C=grid_search.best_params_['C'],
    penalty=grid_search.best_params_['penalty'],
    solver=grid_search.best_params_['solver'],
    class_weight='balanced'
)


In [31]:
# Fit the model to your training data
best_model.fit(X_train_tfidf, y_train)


In [32]:
# Predict on the test set
y_pred = best_model.predict(X_test_tfidf)


In [33]:


# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Generate and print a classification report to see precision, recall, and F1 score
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

# Generate and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.8676056338028169
              precision    recall  f1-score   support

    Negative       0.64      0.43      0.51       115
    Positive       0.90      0.95      0.92       595

    accuracy                           0.87       710
   macro avg       0.77      0.69      0.72       710
weighted avg       0.85      0.87      0.86       710

Confusion Matrix:
 [[ 49  66]
 [ 28 567]]
