In [9]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load mood_data.txt into a DataFrame.
# Each non-empty line is parsed as "<text>;<emotion>".
# NOTE(review): the explicit encoding assumes the file is UTF-8 — confirm.
# Without it, the read depends on the platform's default encoding.
with open('mood_data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# - Blank lines (e.g. a trailing newline at end of file) are skipped; they would
#   otherwise become one-field rows with a missing Emotion.
# - Split on the LAST ';' only, so a ';' inside the text stays part of the text
#   instead of producing a third field that does not fit the two columns.
data = [line.strip().rsplit(';', 1) for line in lines if line.strip()]
df = pd.DataFrame(data, columns=['Text', 'Emotion'])

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [10]:
# Step 2: Tokenize each row, then keep only alphabetic tokens that are not
# stop words, lowercased.
stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Text'].apply(word_tokenize)

# The stop-word test is done on the LOWERCASED token: checking the original
# casing would let capitalised stop words (e.g. "The") slip through.
# isalpha() drops punctuation and any token containing digits or symbols.
df['Tokens'] = df['Tokens'].apply(
    lambda tokens: [
        word.lower()
        for word in tokens
        if word.isalpha() and word.lower() not in stop_words
    ]
)

# Display the first few rows with tokens
df.head()


Unnamed: 0,Text,Emotion,Tokens
0,i didnt feel humiliated,sadness,"[didnt, feel, humiliated]"
1,i can go from feeling so hopeless to so damned...,sadness,"[go, feeling, hopeless, damned, hopeful, aroun..."
2,im grabbing a minute to post i feel greedy wrong,anger,"[im, grabbing, minute, post, feel, greedy, wrong]"
3,i am ever feeling nostalgic about the fireplac...,love,"[ever, feeling, nostalgic, fireplace, know, st..."
4,i am feeling grouchy,anger,"[feeling, grouchy]"


In [11]:
# Step 3: Rebuild one space-separated string per row from its token list and
# keep it in the "Cleaned_Text" column
df['Cleaned_Text'] = df['Tokens'].map(' '.join)

# Preview the frame, now including the cleaned text
df.head()


Unnamed: 0,Text,Emotion,Tokens,Cleaned_Text
0,i didnt feel humiliated,sadness,"[didnt, feel, humiliated]",didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,"[go, feeling, hopeless, damned, hopeful, aroun...",go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,anger,"[im, grabbing, minute, post, feel, greedy, wrong]",im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,"[ever, feeling, nostalgic, fireplace, know, st...",ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,anger,"[feeling, grouchy]",feeling grouchy


In [12]:
# Step 4: Drop single-character words and lemmatize the remaining ones.
# Splitting on whitespace and re-joining with a single space also collapses
# any extra spaces. Special characters were already filtered out when the
# tokens were built (only alphabetic tokens were kept).
# The lemmatizer is created once and reused, rather than once per word.
# lemmatize() is called without a POS tag, so NLTK's default part of speech applies.
lemmatizer = WordNetLemmatizer()
df['Processed_Text'] = df['Cleaned_Text'].apply(
    lambda text: ' '.join(
        lemmatizer.lemmatize(word) for word in text.split() if len(word) > 1
    )
)

# Display the first few rows with processed text
df.head()


Unnamed: 0,Text,Emotion,Tokens,Cleaned_Text,Processed_Text
0,i didnt feel humiliated,sadness,"[didnt, feel, humiliated]",didnt feel humiliated,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,"[go, feeling, hopeless, damned, hopeful, aroun...",go feeling hopeless damned hopeful around some...,go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,anger,"[im, grabbing, minute, post, feel, greedy, wrong]",im grabbing minute post feel greedy wrong,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,"[ever, feeling, nostalgic, fireplace, know, st...",ever feeling nostalgic fireplace know still pr...,ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,anger,"[feeling, grouchy]",feeling grouchy,feeling grouchy


In [13]:
# Step 5: Build the modelling frame: the target (Emotion) next to the
# processed text, in that column order
final_columns = ['Emotion', 'Processed_Text']
final_df = df[final_columns]

# Preview the first rows of the modelling frame
final_df.head()


Unnamed: 0,Emotion,Processed_Text
0,sadness,didnt feel humiliated
1,sadness,go feeling hopeless damned hopeful around some...
2,anger,im grabbing minute post feel greedy wrong
3,love,ever feeling nostalgic fireplace know still pr...
4,anger,feeling grouchy


In [14]:
# Step 6: Separate the independent variable (X: processed text) from the
# dependent variable (Y: emotion label)
X, Y = final_df['Processed_Text'], final_df['Emotion']

# Preview the first rows of both series
(X.head(), Y.head())


(0                                didnt feel humiliated
 1    go feeling hopeless damned hopeful around some...
 2            im grabbing minute post feel greedy wrong
 3    ever feeling nostalgic fireplace know still pr...
 4                                      feeling grouchy
 Name: Processed_Text, dtype: object,
 0    sadness
 1    sadness
 2      anger
 3       love
 4      anger
 Name: Emotion, dtype: object)

In [15]:
# Step 7: Generate tokens and do vectorization using TF-IDF
# Note: the vectorizer is fitted on ALL of X here (no train/test split yet).
# The modelling cell further down rebinds `vectorizer` to a new instance that
# is fitted on the training split only.
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Display the vectorized representation of the processed text
# (printed as "(row, column)  weight" entries, as seen in the output below)
print(X_vectorized)


  (0, 5687)	0.7871275995990267
  (0, 4362)	0.1610350492364661
  (0, 3189)	0.595397224436664
  (1, 837)	0.4170484916806381
  (1, 1706)	0.32307127000452734
  (1, 10971)	0.2797601490698083
  (1, 636)	0.26795366249524677
  (1, 5612)	0.35742662815418186
  (1, 2821)	0.4922829070050744
  (1, 5614)	0.353082697686241
  (1, 4366)	0.11456432921959907
  (1, 5026)	0.2524717248612682
  (2, 13320)	0.40708906062417677
  (2, 5129)	0.41283400202679404
  (2, 8979)	0.364704241061855
  (2, 7501)	0.4148713015794362
  (2, 5074)	0.5613252521089854
  (2, 5801)	0.18970079026227551
  (2, 4362)	0.08744583858937749
  (3, 9224)	0.5412304880943103
  (3, 11279)	0.25688588880993307
  (3, 6591)	0.23614561080135144
  (3, 4472)	0.5545446223057517
  (3, 8035)	0.4015037668131076
  (3, 4045)	0.31737149339929255
  :	:
  (15995, 4362)	0.07224660764258314
  (15996, 11430)	0.4674452543267196
  (15996, 11772)	0.3798371311460169
  (15996, 12917)	0.3167884844776561
  (15996, 2985)	0.3731394167766028
  (15996, 11685)	0.362075439267

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics

# Requires the 'final_df' DataFrame with 'Emotion' and 'Processed_Text' columns
# built in the preprocessing cells above.

# Single seed shared by the split and the random forests, so the accuracies
# printed below are reproducible on a re-run.
RANDOM_STATE = 42


def _fit_and_score(classifier, train_features, train_labels, test_features, test_labels):
    """Fit a classifier on the training split and score it on the test split.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``; fitted in place.
    train_features, train_labels : data passed to ``classifier.fit``.
    test_features : data passed to ``classifier.predict``.
    test_labels : true labels compared against the predictions.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` where ``accuracy`` is the value returned by
        ``metrics.accuracy_score(test_labels, predictions)``.
    """
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)
    return predictions, metrics.accuracy_score(test_labels, predictions)


# Step 1: Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    final_df['Processed_Text'],
    final_df['Emotion'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Step 2: Vectorize the text data using TF-IDF.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Steps 3-4: Train and evaluate a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_predictions, nb_accuracy = _fit_and_score(
    nb_classifier, X_train_vectorized, y_train, X_test_vectorized, y_test
)
print(f"Multinomial Naive Bayes Accuracy: {nb_accuracy}")

# Steps 5-6: Train and evaluate a Random Forest classifier
# (seeded: an unseeded forest gives a different accuracy on every run)
rf_classifier = RandomForestClassifier(random_state=RANDOM_STATE)
rf_predictions, rf_accuracy = _fit_and_score(
    rf_classifier, X_train_vectorized, y_train, X_test_vectorized, y_test
)
print(f"Random Forest Accuracy: {rf_accuracy}")

# Steps 7-8: Train and evaluate a Random Forest classifier with entropy criterion
rf_entropy_classifier = RandomForestClassifier(criterion='entropy', random_state=RANDOM_STATE)
rf_entropy_predictions, rf_entropy_accuracy = _fit_and_score(
    rf_entropy_classifier, X_train_vectorized, y_train, X_test_vectorized, y_test
)
print(f"Random Forest (Entropy) Accuracy: {rf_entropy_accuracy}")

# Steps 9-10: Train and evaluate a Support Vector Machine (SVM) classifier
svm_classifier = SVC()
svm_predictions, svm_accuracy = _fit_and_score(
    svm_classifier, X_train_vectorized, y_train, X_test_vectorized, y_test
)
print(f"SVM Accuracy: {svm_accuracy}")


Multinomial Naive Bayes Accuracy: 0.6634375
Random Forest Accuracy: 0.885
Random Forest (Entropy) Accuracy: 0.8765625
SVM Accuracy: 0.8509375
