# Data Pre-processing

### Import Libraries

In [1]:
# main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# For Data processing/cleaning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import STOPWORDS
from bs4 import BeautifulSoup
import re
from nltk.tokenize.toktok import ToktokTokenizer
import os
import warnings

In [2]:
dataset = pd.read_csv("All_Data.csv")

In [3]:
dataset.shape

(36701, 3)

In [4]:
dataset.head()

Unnamed: 0,date,user,text
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav..."
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...


In [5]:
dataset.isna().any()

date    False
user    False
text    False
dtype: bool

In [6]:
df2 = dataset.drop_duplicates(subset=["text"], keep='first')
df2.shape

(29596, 3)

### Removing @names


In [7]:
def remove_pattern(text,pattern):
    
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text

In [8]:
df2['tidy_tweets'] = np.vectorize(remove_pattern)(df2['text'], "@[\w]*")

df2.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['tidy_tweets'] = np.vectorize(remove_pattern)(df2['text'], "@[\w]*")


Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...","Hello, #AcademicTwitter,\n\n🎙️""You need to hav..."
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
5,2022-11-24 15:46:35+00:00,zellyze,@angewwaa same beh 😭 academicbreaknow!!!,same beh 😭 academicbreaknow!!!
6,2022-11-24 15:40:42+00:00,louvri_,#AcademicBreakNow,#AcademicBreakNow
7,2022-11-24 12:46:22+00:00,_patreng_,#/academicbreaknow tsngina pagod na 'ko magpaypay,#/academicbreaknow tsngina pagod na 'ko magpaypay
8,2022-11-24 08:18:08+00:00,willowveewise,"Pagoda ang accla, 4hours tulog gising 3:40am l...","Pagoda ang accla, 4hours tulog gising 3:40am l..."
10,2022-11-22 15:31:24+00:00,rielles_cart,"sa letra ng p, putangina pagod na ko #academic...","sa letra ng p, putangina pagod na ko #academic..."


###  Removing Punctuations, Numbers, and Special Characters

In [10]:
df2['tidy_tweets'] = df2['tidy_tweets'].str.replace("[^a-zA-Z#]", " ")
df2.head(10)

  df2['tidy_tweets'] = df2['tidy_tweets'].str.replace("[^a-zA-Z#]", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['tidy_tweets'] = df2['tidy_tweets'].str.replace("[^a-zA-Z#]", " ")


Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have ...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
5,2022-11-24 15:46:35+00:00,zellyze,@angewwaa same beh 😭 academicbreaknow!!!,same beh academicbreaknow
6,2022-11-24 15:40:42+00:00,louvri_,#AcademicBreakNow,#AcademicBreakNow
7,2022-11-24 12:46:22+00:00,_patreng_,#/academicbreaknow tsngina pagod na 'ko magpaypay,# academicbreaknow tsngina pagod na ko magpaypay
8,2022-11-24 08:18:08+00:00,willowveewise,"Pagoda ang accla, 4hours tulog gising 3:40am l...",Pagoda ang accla hours tulog gising am l...
10,2022-11-22 15:31:24+00:00,rielles_cart,"sa letra ng p, putangina pagod na ko #academic...",sa letra ng p putangina pagod na ko #academic...


### Removing links

In [11]:
cleaned_tweets = []

for index, row in df2.iterrows():
    # Here we are filtering out all the words that contains link
    words_without_links = [word for word in row.tidy_tweets.split() if 'http' not in word]
    cleaned_tweets.append(' '.join(words_without_links))

df2['tidy_tweets'] = cleaned_tweets
df2.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['tidy_tweets'] = cleaned_tweets


Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have a rest...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
5,2022-11-24 15:46:35+00:00,zellyze,@angewwaa same beh 😭 academicbreaknow!!!,same beh academicbreaknow
6,2022-11-24 15:40:42+00:00,louvri_,#AcademicBreakNow,#AcademicBreakNow
7,2022-11-24 12:46:22+00:00,_patreng_,#/academicbreaknow tsngina pagod na 'ko magpaypay,# academicbreaknow tsngina pagod na ko magpaypay
8,2022-11-24 08:18:08+00:00,willowveewise,"Pagoda ang accla, 4hours tulog gising 3:40am l...",Pagoda ang accla hours tulog gising am lecture...
10,2022-11-22 15:31:24+00:00,rielles_cart,"sa letra ng p, putangina pagod na ko #academic...",sa letra ng p putangina pagod na ko #academicb...


### Remove rows with empty texts

In [13]:
tweets_df = df2[df2['tidy_tweets']!='']
tweets_df.head(10)

Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have a rest...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
5,2022-11-24 15:46:35+00:00,zellyze,@angewwaa same beh 😭 academicbreaknow!!!,same beh academicbreaknow
6,2022-11-24 15:40:42+00:00,louvri_,#AcademicBreakNow,#AcademicBreakNow
7,2022-11-24 12:46:22+00:00,_patreng_,#/academicbreaknow tsngina pagod na 'ko magpaypay,# academicbreaknow tsngina pagod na ko magpaypay
8,2022-11-24 08:18:08+00:00,willowveewise,"Pagoda ang accla, 4hours tulog gising 3:40am l...",Pagoda ang accla hours tulog gising am lecture...
10,2022-11-22 15:31:24+00:00,rielles_cart,"sa letra ng p, putangina pagod na ko #academic...",sa letra ng p putangina pagod na ko #academicb...


### Drop Duplicates

In [14]:
tweets_df.drop_duplicates(subset=['tidy_tweets'], keep='first')
tweets_df.shape

(29593, 4)

### Reset Index


In [15]:
tweets_df = tweets_df.reset_index(drop=True)
tweets_df

Unnamed: 0,date,user,text,tidy_tweets
0,2022-11-29 04:15:58+00:00,maffyolfato,#academicbreaknow,#academicbreaknow
1,2022-11-27 12:18:49+00:00,researcheff,"Hello, #AcademicTwitter,\n\n🎙️""You need to hav...",Hello #AcademicTwitter You need to have a rest...
2,2022-11-27 10:30:42+00:00,dprleanne,lunes nanaman bukas #academicbreaknow,lunes nanaman bukas #academicbreaknow
3,2022-11-27 09:02:47+00:00,ericakieraa,#AcademicBreakNow gusto q na tapusin Wednesday,#AcademicBreakNow gusto q na tapusin Wednesday
4,2022-11-25 13:30:42+00:00,qin_ina,super delay na ako sa tbw list ko #academicbre...,super delay na ako sa tbw list ko #academicbre...
...,...,...,...,...
29588,2022-10-21 05:27:01+00:00,dinnyyyyyyyy,@aicannot si taylor nay nag implement ug acade...,si taylor nay nag implement ug academic break
29589,2022-10-21 05:24:05+00:00,stalinistberet,"@RodericDay ""Actually Lenin wasn't peer review...",Actually Lenin wasn t peer reviewed by establi...
29590,2022-10-21 05:15:33+00:00,6Senoritamae,Academic break plss😭,Academic break plss
29591,2022-10-21 04:49:59+00:00,treysinsmxx,tangina hirap magpa chill chill hahaha daming ...,tangina hirap magpa chill chill hahaha daming ...


In [17]:
tweets_df.shape

(29593, 4)