# Data Cleaning 

In [1]:
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt
from textblob import TextBlob
from pandarallel import pandarallel
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

In [2]:
df = pd.read_csv("new_data.csv", index_col=0)
df

Unnamed: 0_level_0,data,data_list,spelling_mistakes
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
655186867e442d2d9b63da4f,Megumi-chan is a girl with a secret past. She ...,"['Megumi-chan', 'is', 'a', 'girl', 'with', 'a'...",True
655186867e442d2d9b63da50,"Just where do Curses, the fiendish spirits inv...","['Just', 'where', 'do', 'Curses,', 'the', 'fie...",True
655186867e442d2d9b63da51,"Years after the Fourth Holy Grail War, Waver V...","['Years', 'after', 'the', 'Fourth', 'Holy', 'G...",True
655186867e442d2d9b63da52,Fudou Aikawa and Desumi Magahara have just sta...,"['Fudou', 'Aikawa', 'and', 'Desumi', 'Magahara...",True
655186867e442d2d9b63da53,"Having completed their Stagiaire assignments, ...","['Having', 'completed', 'their', 'Stagiaire', ...",True
...,...,...,...
655186877e442d2d9b63edd2,"In the year 2314 AD, the world is at peace. Th...","['In', 'the', 'year', '2314', 'AD,', 'the', 'w...",True
655186877e442d2d9b63edd3,It is 2977 AD and mankind has become stagnant....,"['It', 'is', '2977', 'AD', 'and', 'mankind', '...",True
655186877e442d2d9b63edd4,"After a fatal encounter with a truck, Takafumi...","['After', 'a', 'fatal', 'encounter', 'with', '...",True
655186877e442d2d9b63edd5,Haruhi Fujioka is a studious girl who has rece...,"['Haruhi', 'Fujioka', 'is', 'a', 'studious', '...",True


## Remove all stopwords and punctuation

In [3]:
def remove_stopwords_and_punctuation(text):
    words = text.split()
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]
    words = [word for word in words if word not in string.punctuation]
    cleaned_text = ' '.join(words)
    return cleaned_text

df['data_cleaned'] = df['data'].apply(remove_stopwords_and_punctuation)
df

Unnamed: 0_level_0,data,data_list,spelling_mistakes,data_cleaned
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
655186867e442d2d9b63da4f,Megumi-chan is a girl with a secret past. She ...,"['Megumi-chan', 'is', 'a', 'girl', 'with', 'a'...",True,Megumi-chan girl secret past. used boy met per...
655186867e442d2d9b63da50,"Just where do Curses, the fiendish spirits inv...","['Just', 'where', 'do', 'Curses,', 'the', 'fie...",True,"Curses, fiendish spirits invisible normal huma..."
655186867e442d2d9b63da51,"Years after the Fourth Holy Grail War, Waver V...","['Years', 'after', 'the', 'Fourth', 'Holy', 'G...",True,"Years Fourth Holy Grail War, Waver Velvet beco..."
655186867e442d2d9b63da52,Fudou Aikawa and Desumi Magahara have just sta...,"['Fudou', 'Aikawa', 'and', 'Desumi', 'Magahara...",True,"Fudou Aikawa Desumi Magahara started dating, o..."
655186867e442d2d9b63da53,"Having completed their Stagiaire assignments, ...","['Having', 'completed', 'their', 'Stagiaire', ...",True,"completed Stagiaire assignments, residents Pol..."
...,...,...,...,...
655186877e442d2d9b63edd2,"In the year 2314 AD, the world is at peace. Th...","['In', 'the', 'year', '2314', 'AD,', 'the', 'w...",True,"year 2314 AD, world peace. Thanks sacrifices C..."
655186877e442d2d9b63edd3,It is 2977 AD and mankind has become stagnant....,"['It', 'is', '2977', 'AD', 'and', 'mankind', '...",True,"2977 AD mankind become stagnant. Robots work, ..."
655186877e442d2d9b63edd4,"After a fatal encounter with a truck, Takafumi...","['After', 'a', 'fatal', 'encounter', 'with', '...",True,"fatal encounter truck, Takafumi Takaoka's uncl..."
655186877e442d2d9b63edd5,Haruhi Fujioka is a studious girl who has rece...,"['Haruhi', 'Fujioka', 'is', 'a', 'studious', '...",True,Haruhi Fujioka studious girl recently enrolled...


## Print the 10 most frequent words in all textual content.

In [4]:
all_words = [word for sublist in df['data_cleaned'].str.split() for word in sublist]

word_counts = Counter(all_words)
most_common_words = word_counts.most_common(10)

print("Top 10 most frequent words:")
for word, count in most_common_words:
    print(f"{word}: {count} times")

Top 10 most frequent words:
MAL: 2430 times
[Written: 2381 times
Rewrite]: 2380 times
(Source:: 1382 times
new: 1348 times
one: 1228 times
However,: 1185 times
school: 892 times
world: 884 times
two: 815 times


## Convert all words to small letters.

In [5]:
df["data_cleaned"] = df["data_cleaned"].apply(lambda x: x.lower())
df

Unnamed: 0_level_0,data,data_list,spelling_mistakes,data_cleaned
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
655186867e442d2d9b63da4f,Megumi-chan is a girl with a secret past. She ...,"['Megumi-chan', 'is', 'a', 'girl', 'with', 'a'...",True,megumi-chan girl secret past. used boy met per...
655186867e442d2d9b63da50,"Just where do Curses, the fiendish spirits inv...","['Just', 'where', 'do', 'Curses,', 'the', 'fie...",True,"curses, fiendish spirits invisible normal huma..."
655186867e442d2d9b63da51,"Years after the Fourth Holy Grail War, Waver V...","['Years', 'after', 'the', 'Fourth', 'Holy', 'G...",True,"years fourth holy grail war, waver velvet beco..."
655186867e442d2d9b63da52,Fudou Aikawa and Desumi Magahara have just sta...,"['Fudou', 'Aikawa', 'and', 'Desumi', 'Magahara...",True,"fudou aikawa desumi magahara started dating, o..."
655186867e442d2d9b63da53,"Having completed their Stagiaire assignments, ...","['Having', 'completed', 'their', 'Stagiaire', ...",True,"completed stagiaire assignments, residents pol..."
...,...,...,...,...
655186877e442d2d9b63edd2,"In the year 2314 AD, the world is at peace. Th...","['In', 'the', 'year', '2314', 'AD,', 'the', 'w...",True,"year 2314 ad, world peace. thanks sacrifices c..."
655186877e442d2d9b63edd3,It is 2977 AD and mankind has become stagnant....,"['It', 'is', '2977', 'AD', 'and', 'mankind', '...",True,"2977 ad mankind become stagnant. robots work, ..."
655186877e442d2d9b63edd4,"After a fatal encounter with a truck, Takafumi...","['After', 'a', 'fatal', 'encounter', 'with', '...",True,"fatal encounter truck, takafumi takaoka's uncl..."
655186877e442d2d9b63edd5,Haruhi Fujioka is a studious girl who has rece...,"['Haruhi', 'Fujioka', 'is', 'a', 'studious', '...",True,haruhi fujioka studious girl recently enrolled...


## Print the count of unique words in the text.

In [6]:
all_words = [word for sublist in df['data_cleaned'].str.split() for word in sublist]
len(set(all_words))

54166

## Replace all typos with the correct word.

In [7]:
def check_spelling_mistakes(text):
    words = text.split()
    corrected_words = []

    for word in words:
        blob = TextBlob(word)
        corrected_word = str(blob.correct())
        corrected_words.append(corrected_word)

    corrected_text = ' '.join(corrected_words)
    return text if text == corrected_text else corrected_text

pandarallel.initialize(nb_workers=12)

results = df['data_cleaned'].parallel_apply(check_spelling_mistakes)
df['correct_text'] = results
df

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Unnamed: 0_level_0,data,data_list,spelling_mistakes,data_cleaned,correct_text
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
655186867e442d2d9b63da4f,Megumi-chan is a girl with a secret past. She ...,"['Megumi-chan', 'is', 'a', 'girl', 'with', 'a'...",True,megumi-chan girl secret past. used boy met per...,megumi-than girl secret past. used boy met per...
655186867e442d2d9b63da50,"Just where do Curses, the fiendish spirits inv...","['Just', 'where', 'do', 'Curses,', 'the', 'fie...",True,"curses, fiendish spirits invisible normal huma...","curses, finish spirits invisible normal humans..."
655186867e442d2d9b63da51,"Years after the Fourth Holy Grail War, Waver V...","['Years', 'after', 'the', 'Fourth', 'Holy', 'G...",True,"years fourth holy grail war, waver velvet beco...","years fourth holy grain war, waver velvet beco..."
655186867e442d2d9b63da52,Fudou Aikawa and Desumi Magahara have just sta...,"['Fudou', 'Aikawa', 'and', 'Desumi', 'Magahara...",True,"fudou aikawa desumi magahara started dating, o...","fedor aikawa resume magahara started dating, o..."
655186867e442d2d9b63da53,"Having completed their Stagiaire assignments, ...","['Having', 'completed', 'their', 'Stagiaire', ...",True,"completed stagiaire assignments, residents pol...","completed stagiaire assignment, residents pola..."
...,...,...,...,...,...
655186877e442d2d9b63edd2,"In the year 2314 AD, the world is at peace. Th...","['In', 'the', 'year', '2314', 'AD,', 'the', 'w...",True,"year 2314 ad, world peace. thanks sacrifices c...","year 2314 ad, world peace. thanks sacrifices c..."
655186877e442d2d9b63edd3,It is 2977 AD and mankind has become stagnant....,"['It', 'is', '2977', 'AD', 'and', 'mankind', '...",True,"2977 ad mankind become stagnant. robots work, ...","2977 ad mankind become stagnant. roots work, m..."
655186877e442d2d9b63edd4,"After a fatal encounter with a truck, Takafumi...","['After', 'a', 'fatal', 'encounter', 'with', '...",True,"fatal encounter truck, takafumi takaoka's uncl...","fatal encounter truck, takafumi makarka's uncl..."
655186877e442d2d9b63edd5,Haruhi Fujioka is a studious girl who has rece...,"['Haruhi', 'Fujioka', 'is', 'a', 'studious', '...",True,haruhi fujioka studious girl recently enrolled...,harsh fujioka studious girl recently enrolled ...


In [11]:
df.iloc[0]["data_cleaned"]

"megumi-chan girl secret past. used boy met person thought magic user. person gave him/her magical book genie appears grant one wish blood applied it. megumi made wish man man's body genie twist: grants wishes backwards turns megumi-kun aged 9 megumi-chan. years pass megumi enters high school immediately beats school bully course falls love her. looking book able reverse spell placed upon her. (source: ann)"

In [12]:
df.iloc[0]["correct_text"]

"megumi-than girl secret past. used boy met person thought magic user. person gave him/her magical book genii appears grant one wish blood applied it. megumi made wish man man's body genii twist: grants wishes backwards turns megumi-run aged 9 megumi-than. years pass megumi enters high school immediately beats school bully course falls love her. looking book able reverse spell placed upon her. (source: ann)"

## Takking the words to its root (Apply a lemmatization technique)

In [13]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

df['lemmatized_text'] = df['correct_text'].parallel_apply(lemmatize_text)
df.to_csv("cleaned_data.csv", index=False)
df

Unnamed: 0_level_0,data,data_list,spelling_mistakes,data_cleaned,correct_text,lemmatized_text
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
655186867e442d2d9b63da4f,Megumi-chan is a girl with a secret past. She ...,"['Megumi-chan', 'is', 'a', 'girl', 'with', 'a'...",True,megumi-chan girl secret past. used boy met per...,megumi-than girl secret past. used boy met per...,megumi-than girl secret past. used boy met per...
655186867e442d2d9b63da50,"Just where do Curses, the fiendish spirits inv...","['Just', 'where', 'do', 'Curses,', 'the', 'fie...",True,"curses, fiendish spirits invisible normal huma...","curses, finish spirits invisible normal humans...","curses, finish spirit invisible normal humans,..."
655186867e442d2d9b63da51,"Years after the Fourth Holy Grail War, Waver V...","['Years', 'after', 'the', 'Fourth', 'Holy', 'G...",True,"years fourth holy grail war, waver velvet beco...","years fourth holy grain war, waver velvet beco...","year fourth holy grain war, waver velvet becom..."
655186867e442d2d9b63da52,Fudou Aikawa and Desumi Magahara have just sta...,"['Fudou', 'Aikawa', 'and', 'Desumi', 'Magahara...",True,"fudou aikawa desumi magahara started dating, o...","fedor aikawa resume magahara started dating, o...","fedor aikawa resume magahara started dating, o..."
655186867e442d2d9b63da53,"Having completed their Stagiaire assignments, ...","['Having', 'completed', 'their', 'Stagiaire', ...",True,"completed stagiaire assignments, residents pol...","completed stagiaire assignment, residents pola...","completed stagiaire assignment, resident polar..."
...,...,...,...,...,...,...
655186877e442d2d9b63edd2,"In the year 2314 AD, the world is at peace. Th...","['In', 'the', 'year', '2314', 'AD,', 'the', 'w...",True,"year 2314 ad, world peace. thanks sacrifices c...","year 2314 ad, world peace. thanks sacrifices c...","year 2314 ad, world peace. thanks sacrifice ce..."
655186877e442d2d9b63edd3,It is 2977 AD and mankind has become stagnant....,"['It', 'is', '2977', 'AD', 'and', 'mankind', '...",True,"2977 ad mankind become stagnant. robots work, ...","2977 ad mankind become stagnant. roots work, m...","2977 ad mankind become stagnant. root work, ma..."
655186877e442d2d9b63edd4,"After a fatal encounter with a truck, Takafumi...","['After', 'a', 'fatal', 'encounter', 'with', '...",True,"fatal encounter truck, takafumi takaoka's uncl...","fatal encounter truck, takafumi makarka's uncl...","fatal encounter truck, takafumi makarka's uncl..."
655186877e442d2d9b63edd5,Haruhi Fujioka is a studious girl who has rece...,"['Haruhi', 'Fujioka', 'is', 'a', 'studious', '...",True,haruhi fujioka studious girl recently enrolled...,harsh fujioka studious girl recently enrolled ...,harsh fujioka studious girl recently enrolled ...


In [14]:
df.iloc[0]["correct_text"]

"megumi-than girl secret past. used boy met person thought magic user. person gave him/her magical book genii appears grant one wish blood applied it. megumi made wish man man's body genii twist: grants wishes backwards turns megumi-run aged 9 megumi-than. years pass megumi enters high school immediately beats school bully course falls love her. looking book able reverse spell placed upon her. (source: ann)"

In [15]:
df.iloc[0]["lemmatized_text"]

"megumi-than girl secret past. used boy met person thought magic user. person gave him/her magical book genius appears grant one wish blood applied it. megumi made wish man man's body genius twist: grant wish backwards turn megumi-run aged 9 megumi-than. year pas megumi enters high school immediately beat school bully course fall love her. looking book able reverse spell placed upon her. (source: ann)"

## Print the 10 most frequent words in all textual content.

In [16]:
all_words = [word for sublist in df['lemmatized_text'].str.split() for word in sublist]
len(set(all_words))

43258