In [None]:
import string
import unicodedata

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn
import textblob
import vaderSentiment

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the tweets CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='confessions_tweets.csv')

In [None]:
# Print the number of rows, then display the first five rows.
print(df.shape[0])
df.head()

3126


Unnamed: 0,Tweet Id,Text,Sentiment,UTC,Created At,Favorites,Retweets,Unnamed: 7
0,1640413566561038338,ras are you in debt,-1.0,2023-03-27T18:00:46.000Z,Mon Mar 27 18:00:46 +0000 2023,2,0,
1,1640412154800267264,guys at this rate of releasing season mixes I'...,0.0,2023-03-27T17:55:09.000Z,Mon Mar 27 17:55:09 +0000 2023,2,0,
2,1640412115277320198,how do I practice this philthy set if they hav...,0.0,2023-03-27T17:55:00.000Z,Mon Mar 27 17:55:00 +0000 2023,0,0,
3,1640412092569358336,O G KUSH WHERE ARE YOUR MIXES,0.0,2023-03-27T17:54:54.000Z,Mon Mar 27 17:54:54 +0000 2023,0,0,
4,1640412074378670080,Is there a reason I can’t drunk watch Rangeelo...,0.0,2023-03-27T17:54:50.000Z,Mon Mar 27 17:54:50 +0000 2023,0,0,


## Pre-processing the dataset

### Punctuation Removal

In [None]:
def remove_punctuation(text):
  """Return `text` with every punctuation character removed.

  A character is dropped when it appears in `string.punctuation` (the ASCII
  punctuation and symbol characters) or when its Unicode general category
  starts with 'P'. The second check covers typographic marks such as curly
  quotes/apostrophes, dashes and ellipses, so "can’t" is reduced to "cant"
  exactly like "can't" is.

  Args:
    text: the string to clean.

  Returns:
    A new string holding the remaining characters in their original order.
  """
  ascii_punctuation = set(string.punctuation)
  return ''.join(
      char for char in text
      if char not in ascii_punctuation
      and not unicodedata.category(char).startswith('P')
  )

In [None]:
# Store the punctuation-free version of each tweet in a new working column.
df['clean_text'] = df['Text'].map(remove_punctuation)
df.head()

Unnamed: 0,Tweet Id,Text,Sentiment,UTC,Created At,Favorites,Retweets,Unnamed: 7,clean_text
0,1640413566561038338,ras are you in debt,-1.0,2023-03-27T18:00:46.000Z,Mon Mar 27 18:00:46 +0000 2023,2,0,,ras are you in debt
1,1640412154800267264,guys at this rate of releasing season mixes I'...,0.0,2023-03-27T17:55:09.000Z,Mon Mar 27 17:55:09 +0000 2023,2,0,,guys at this rate of releasing season mixes Im...
2,1640412115277320198,how do I practice this philthy set if they hav...,0.0,2023-03-27T17:55:00.000Z,Mon Mar 27 17:55:00 +0000 2023,0,0,,how do I practice this philthy set if they hav...
3,1640412092569358336,O G KUSH WHERE ARE YOUR MIXES,0.0,2023-03-27T17:54:54.000Z,Mon Mar 27 17:54:54 +0000 2023,0,0,,O G KUSH WHERE ARE YOUR MIXES
4,1640412074378670080,Is there a reason I can’t drunk watch Rangeelo...,0.0,2023-03-27T17:54:50.000Z,Mon Mar 27 17:54:50 +0000 2023,0,0,,Is there a reason I can’t drunk watch Rangeelo...


### Lowercase

In [None]:
# Lowercase every cleaned tweet so later token comparisons are case-insensitive.
df['clean_text'] = df['clean_text'].map(str.lower)
df.head()

Unnamed: 0,Tweet Id,Text,Sentiment,UTC,Created At,Favorites,Retweets,Unnamed: 7,clean_text
0,1640413566561038338,ras are you in debt,-1.0,2023-03-27T18:00:46.000Z,Mon Mar 27 18:00:46 +0000 2023,2,0,,ras are you in debt
1,1640412154800267264,guys at this rate of releasing season mixes I'...,0.0,2023-03-27T17:55:09.000Z,Mon Mar 27 17:55:09 +0000 2023,2,0,,guys at this rate of releasing season mixes im...
2,1640412115277320198,how do I practice this philthy set if they hav...,0.0,2023-03-27T17:55:00.000Z,Mon Mar 27 17:55:00 +0000 2023,0,0,,how do i practice this philthy set if they hav...
3,1640412092569358336,O G KUSH WHERE ARE YOUR MIXES,0.0,2023-03-27T17:54:54.000Z,Mon Mar 27 17:54:54 +0000 2023,0,0,,o g kush where are your mixes
4,1640412074378670080,Is there a reason I can’t drunk watch Rangeelo...,0.0,2023-03-27T17:54:50.000Z,Mon Mar 27 17:54:50 +0000 2023,0,0,,is there a reason i can’t drunk watch rangeelo...


### Tokenization

In [None]:
def tokenize(text):
  """Split `text` into NLTK word tokens and keep only the alphabetic ones.

  Tokens for which `str.isalpha()` is false (digits, leftover symbols,
  mixed alphanumerics) are discarded.
  """
  return list(filter(str.isalpha, nltk.word_tokenize(text)))

In [None]:
# Tokenize each cleaned tweet; one list of tokens per row.
df['tokenized'] = df['clean_text'].apply(tokenize)
df.head()

Unnamed: 0,Tweet Id,Text,Sentiment,UTC,Created At,Favorites,Retweets,Unnamed: 7,clean_text,tokenized
0,1640413566561038338,ras are you in debt,-1.0,2023-03-27T18:00:46.000Z,Mon Mar 27 18:00:46 +0000 2023,2,0,,ras are you in debt,"[ras, are, you, in, debt]"
1,1640412154800267264,guys at this rate of releasing season mixes I'...,0.0,2023-03-27T17:55:09.000Z,Mon Mar 27 17:55:09 +0000 2023,2,0,,guys at this rate of releasing season mixes im...,"[guys, at, this, rate, of, releasing, season, ..."
2,1640412115277320198,how do I practice this philthy set if they hav...,0.0,2023-03-27T17:55:00.000Z,Mon Mar 27 17:55:00 +0000 2023,0,0,,how do i practice this philthy set if they hav...,"[how, do, i, practice, this, philthy, set, if,..."
3,1640412092569358336,O G KUSH WHERE ARE YOUR MIXES,0.0,2023-03-27T17:54:54.000Z,Mon Mar 27 17:54:54 +0000 2023,0,0,,o g kush where are your mixes,"[o, g, kush, where, are, your, mixes]"
4,1640412074378670080,Is there a reason I can’t drunk watch Rangeelo...,0.0,2023-03-27T17:54:50.000Z,Mon Mar 27 17:54:50 +0000 2023,0,0,,is there a reason i can’t drunk watch rangeelo...,"[is, there, a, reason, i, can, t, drunk, watch..."


### Removing Stopwords

In [None]:
# NLTK's English stopword list, shared by remove_stops below.
stopwords = nltk.corpus.stopwords.words('english')

def remove_stops(text):
  """Return the tokens of `text` (a list of words) that are not stopwords."""
  kept = []
  for token in text:
    if token in stopwords:
      continue
    kept.append(token)
  return kept

In [None]:
# Filter the stopwords out of every token list.
df['tokenized'] = df['tokenized'].map(remove_stops)
df.head()

Unnamed: 0,Tweet Id,Text,Sentiment,UTC,Created At,Favorites,Retweets,Unnamed: 7,clean_text,tokenized
0,1640413566561038338,ras are you in debt,-1.0,2023-03-27T18:00:46.000Z,Mon Mar 27 18:00:46 +0000 2023,2,0,,ras are you in debt,"[ras, debt]"
1,1640412154800267264,guys at this rate of releasing season mixes I'...,0.0,2023-03-27T17:55:09.000Z,Mon Mar 27 17:55:09 +0000 2023,2,0,,guys at this rate of releasing season mixes im...,"[guys, rate, releasing, season, mixes, im, gon..."
2,1640412115277320198,how do I practice this philthy set if they hav...,0.0,2023-03-27T17:55:00.000Z,Mon Mar 27 17:55:00 +0000 2023,0,0,,how do i practice this philthy set if they hav...,"[practice, philthy, set, havent, dropped, mix]"
3,1640412092569358336,O G KUSH WHERE ARE YOUR MIXES,0.0,2023-03-27T17:54:54.000Z,Mon Mar 27 17:54:54 +0000 2023,0,0,,o g kush where are your mixes,"[g, kush, mixes]"
4,1640412074378670080,Is there a reason I can’t drunk watch Rangeelo...,0.0,2023-03-27T17:54:50.000Z,Mon Mar 27 17:54:50 +0000 2023,0,0,,is there a reason i can’t drunk watch rangeelo...,"[reason, drunk, watch, rangeelo, videos, beach..."


### Lemmatization

In [None]:
# Single WordNet lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
  """Lemmatize each token in `text` (a list of words), treating it as a verb.

  Every token is passed to the lemmatizer with pos='v'; the result is a new
  list in the same order.
  """
  lemmas = []
  for token in text:
    lemmas.append(lemmatizer.lemmatize(token, pos='v'))
  return lemmas

In [None]:
# Replace every token list with its lemmatized counterpart.
df['tokenized'] = df['tokenized'].map(lemmatize_words)
df.head()

Unnamed: 0,Tweet Id,Text,Sentiment,UTC,Created At,Favorites,Retweets,Unnamed: 7,clean_text,tokenized
0,1640413566561038338,ras are you in debt,-1.0,2023-03-27T18:00:46.000Z,Mon Mar 27 18:00:46 +0000 2023,2,0,,ras are you in debt,"[ras, debt]"
1,1640412154800267264,guys at this rate of releasing season mixes I'...,0.0,2023-03-27T17:55:09.000Z,Mon Mar 27 17:55:09 +0000 2023,2,0,,guys at this rate of releasing season mixes im...,"[guy, rate, release, season, mix, im, gon, na,..."
2,1640412115277320198,how do I practice this philthy set if they hav...,0.0,2023-03-27T17:55:00.000Z,Mon Mar 27 17:55:00 +0000 2023,0,0,,how do i practice this philthy set if they hav...,"[practice, philthy, set, havent, drop, mix]"
3,1640412092569358336,O G KUSH WHERE ARE YOUR MIXES,0.0,2023-03-27T17:54:54.000Z,Mon Mar 27 17:54:54 +0000 2023,0,0,,o g kush where are your mixes,"[g, kush, mix]"
4,1640412074378670080,Is there a reason I can’t drunk watch Rangeelo...,0.0,2023-03-27T17:54:50.000Z,Mon Mar 27 17:54:50 +0000 2023,0,0,,is there a reason i can’t drunk watch rangeelo...,"[reason, drink, watch, rangeelo, videos, beach..."


In [None]:
# Rebuild clean_text as a single space-separated string of the processed tokens.

df['clean_text'] = df['tokenized'].map(' '.join)
df.head()

Unnamed: 0,Tweet Id,Text,Sentiment,UTC,Created At,Favorites,Retweets,Unnamed: 7,clean_text,tokenized
0,1640413566561038338,ras are you in debt,-1.0,2023-03-27T18:00:46.000Z,Mon Mar 27 18:00:46 +0000 2023,2,0,,ras debt,"[ras, debt]"
1,1640412154800267264,guys at this rate of releasing season mixes I'...,0.0,2023-03-27T17:55:09.000Z,Mon Mar 27 17:55:09 +0000 2023,2,0,,guy rate release season mix im gon na anything...,"[guy, rate, release, season, mix, im, gon, na,..."
2,1640412115277320198,how do I practice this philthy set if they hav...,0.0,2023-03-27T17:55:00.000Z,Mon Mar 27 17:55:00 +0000 2023,0,0,,practice philthy set havent drop mix,"[practice, philthy, set, havent, drop, mix]"
3,1640412092569358336,O G KUSH WHERE ARE YOUR MIXES,0.0,2023-03-27T17:54:54.000Z,Mon Mar 27 17:54:54 +0000 2023,0,0,,g kush mix,"[g, kush, mix]"
4,1640412074378670080,Is there a reason I can’t drunk watch Rangeelo...,0.0,2023-03-27T17:54:50.000Z,Mon Mar 27 17:54:50 +0000 2023,0,0,,reason drink watch rangeelo videos beach sprin...,"[reason, drink, watch, rangeelo, videos, beach..."


## Sentiments

In [None]:
# One shared VADER analyzer instance, reused for every tweet.
analyzer = SentimentIntensityAnalyzer()

def calc_sentiment(text):
  """Return TextBlob's `sentiment` result for `text`."""
  blob = TextBlob(text)
  return blob.sentiment

def calc_sentiment_analyzer(text):
  """Return the VADER polarity-score dict for `text` (includes a 'compound' key)."""
  scores = analyzer.polarity_scores(text)
  return scores

In [None]:
# Scores at or beyond +/- this value are labelled Positive/Negative; everything
# in between is Neutral.
COMPOUND_THRESHOLD = 0.20

def _label_compound(score):
  """Map a VADER compound score to 'Negative', 'Positive' or 'Neutral'.

  The rule is exhaustive and non-overlapping, so every score gets exactly one
  label. It reproduces every label the earlier three-`if` version ended up
  assigning (Neutral used to overwrite Positive below 0.20) and additionally
  labels scores in (-0.20, -0.15] as Neutral; those previously matched no
  branch and were left as NaN.
  """
  if score <= -COMPOUND_THRESHOLD:
    return 'Negative'
  if score >= COMPOUND_THRESHOLD:
    return 'Positive'
  return 'Neutral'

# TextBlob sentiment and the full VADER score dict for each cleaned tweet.
df['sentiment'] = df['clean_text'].apply(calc_sentiment)
df['sentiment_analysis'] = df['clean_text'].apply(calc_sentiment_analyzer)

# Extract the compound score in one pass instead of assigning cell by cell
# through chained indexing (s[col][i] = ...), which pandas does not guarantee
# to write back and which is a no-op under Copy-on-Write.
compound = df['sentiment_analysis'].apply(lambda scores: scores['compound'])

# `s` is kept as the intermediate frame holding the score and its label.
s = pd.DataFrame({
    'compound_score': compound,
    'compound_score_sentiment': compound.apply(_label_compound),
})

df['compound_score'] = s['compound_score']
df['compound_score_sentiment'] = s['compound_score_sentiment']

df.head()

Unnamed: 0,Tweet Id,Text,Sentiment,UTC,Created At,Favorites,Retweets,Unnamed: 7,clean_text,tokenized,sentiment,sentiment_analysis,compound_score,compound_score_sentiment
0,1640413566561038338,ras are you in debt,-1.0,2023-03-27T18:00:46.000Z,Mon Mar 27 18:00:46 +0000 2023,2,0,,ras debt,"[ras, debt]","(0.0, 0.0)","{'neg': 0.714, 'neu': 0.286, 'pos': 0.0, 'comp...",-0.3612,Negative
1,1640412154800267264,guys at this rate of releasing season mixes I'...,0.0,2023-03-27T17:55:09.000Z,Mon Mar 27 17:55:09 +0000 2023,2,0,,guy rate release season mix im gon na anything...,"[guy, rate, release, season, mix, im, gon, na,...","(0.0, 0.0)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,Neutral
2,1640412115277320198,how do I practice this philthy set if they hav...,0.0,2023-03-27T17:55:00.000Z,Mon Mar 27 17:55:00 +0000 2023,0,0,,practice philthy set havent drop mix,"[practice, philthy, set, havent, drop, mix]","(0.0, 0.0)","{'neg': 0.0, 'neu': 0.734, 'pos': 0.266, 'comp...",0.2057,Positive
3,1640412092569358336,O G KUSH WHERE ARE YOUR MIXES,0.0,2023-03-27T17:54:54.000Z,Mon Mar 27 17:54:54 +0000 2023,0,0,,g kush mix,"[g, kush, mix]","(0.0, 0.0)","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,Neutral
4,1640412074378670080,Is there a reason I can’t drunk watch Rangeelo...,0.0,2023-03-27T17:54:50.000Z,Mon Mar 27 17:54:50 +0000 2023,0,0,,reason drink watch rangeelo videos beach sprin...,"[reason, drink, watch, rangeelo, videos, beach...","(0.0, 0.0)","{'neg': 0.223, 'neu': 0.777, 'pos': 0.0, 'comp...",-0.3182,Negative


In [None]:
# Number of tweets per sentiment label.
df['compound_score_sentiment'].value_counts()

Positive    1258
Neutral     1224
Negative     593
Name: compound_score_sentiment, dtype: int64

## TF-IDF and K Means

In [None]:
# Build TF-IDF weighted features over unigrams and bigrams of the cleaned
# tweets. TfidfVectorizer (already imported) is used so the features match the
# section title and the variable name; raw counts from CountVectorizer are not
# TF-IDF. The tokenisation settings are unchanged.
tf_idf_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english', min_df=0.0001)

# Learn the vocabulary/IDF weights and vectorise the same corpus in one step.
desc_matrix = tf_idf_vect.fit_transform(df['clean_text'])

In [None]:
# K-Means starts from random centroids, so fix the seed to make the cluster
# assignments reproducible across runs. n_init is set explicitly because its
# default differs between scikit-learn versions.
km = KMeans(n_clusters=2, max_iter=1000, n_init=10, random_state=42)
km.fit(desc_matrix)

# One cluster label per tweet, in the same row order as df.
clusters = km.labels_.tolist()



In [None]:
# Pair every cleaned tweet with the cluster label it was assigned.
tweets = dict(Tweet=df['clean_text'].tolist(), Cluster=clusters)
frame = pd.DataFrame(tweets)
frame

Unnamed: 0,Tweet,Cluster
0,ras debt,1
1,guy rate release season mix im gon na anything...,1
2,practice philthy set havent drop mix,1
3,g kush mix,1
4,reason drink watch rangeelo videos beach sprin...,1
...,...,...
3121,shoutout ras amaze kickoff season,1
3122,vts comeback year,1
3123,everyone put chaos together feel good back,1
3124,girl red dunk next nandin take pics gt show fi...,1


In [None]:
# Number of tweets assigned to each cluster.
frame.Cluster.value_counts()

1    2774
0     352
Name: Cluster, dtype: int64

In [None]:
# Tweets assigned to cluster 0.
frame.loc[frame['Cluster'].eq(0)]

Unnamed: 0,Tweet,Cluster
29,nakhraas cap guy solo go absolutely bonkers te...,0
30,dissapointed ras show afterparty insane entire...,0
35,want rage text people team watch back row vide...,0
46,listen ga floor ticket ras xvi find venue body...,0
52,team already try choreo year see tiktok,0
...,...,...
3064,washington team pop year feel,0
3075,biggest sleeper team vcu nakhraas,0
3080,people give much hype team dont need rn season...,0
3090,team shoutout ru really nice others rude would...,0


In [None]:
# Tweets assigned to cluster 1.
frame.loc[frame['Cluster'].eq(1)]

Unnamed: 0,Tweet,Cluster
0,ras debt,1
1,guy rate release season mix im gon na anything...,1
2,practice philthy set havent drop mix,1
3,g kush mix,1
4,reason drink watch rangeelo videos beach sprin...,1
...,...,...
3121,shoutout ras amaze kickoff season,1
3122,vts comeback year,1
3123,everyone put chaos together feel good back,1
3124,girl red dunk next nandin take pics gt show fi...,1


In [None]:
# Tweets assigned to cluster 2.
# NOTE(review): KMeans is fitted with n_clusters=2, so only labels 0 and 1 can
# occur and this selection is empty on a fresh run; the saved output appears to
# come from an earlier run with more clusters. Confirm and re-run or remove.
frame.loc[frame['Cluster'].eq(2)]

Unnamed: 0,Tweet,Cluster
43,look forward see ucf philthy ras next year,2
48,ucf girls songs easily one best season keep re...,2
50,yea sorry ucf u guy really good ideas go take ras,2
56,osu origins ucf fav set year even close,2
57,rushil ucf single,2
...,...,...
2531,two brothers ucf goatees fun talk anyone insta...,2
2991,look forward ucf rampage,2
3015,rova tulsi uf parth uf ricky gt siddhi purdue ...,2
3024,hear ucf fell year,2


In [None]:
# Tweets assigned to cluster 3.
# NOTE(review): KMeans is fitted with n_clusters=2, so only labels 0 and 1 can
# occur and this selection is empty on a fresh run; the saved output appears to
# come from an earlier run with more clusters. Confirm and re-run or remove.
frame.loc[frame['Cluster'].eq(3)]

Unnamed: 0,Tweet,Cluster
29,nakhraas cap guy solo go absolutely bonkers te...,3
30,dissapointed ras show afterparty insane entire...,3
35,want rage text people team watch back row vide...,3
46,listen ga floor ticket ras xvi find venue body...,3
52,team already try choreo year see tiktok,3
...,...,...
3064,washington team pop year feel,3
3075,biggest sleeper team vcu nakhraas,3
3080,people give much hype team dont need rn season...,3
3090,team shoutout ru really nice others rude would...,3


In [None]:
# Tweets assigned to cluster 4.
# NOTE(review): KMeans is fitted with n_clusters=2, so only labels 0 and 1 can
# occur; this selection is always empty. Confirm and remove if unneeded.
frame.loc[frame['Cluster'].eq(4)]

Unnamed: 0,Tweet,Cluster
