In [None]:
import pandas as pd
import numpy as np
import re
import os

import nltk
from nltk.corpus import stopwords
from lemminflect import getLemma
import spacy

In [None]:
# Fetch NLTK's 'stopwords' corpus; it is read later via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Read the raw CSV into `dataset` and preview its first rows.
# NOTE(review): the location is a hard-coded absolute path; adjust it when running elsewhere.
RAW_DATA_CSV = '/content/raw_data.csv'
dataset = pd.read_csv(RAW_DATA_CSV)
dataset.head()

In [None]:
# Working frame: a single 'review' column holding the values of dataset['content'].
df = pd.DataFrame({'review': dataset['content']})
df.head()

Unnamed: 0,review
0,"Ever since the update, there's a weird glitch ..."
1,Don't believe the news!!! You can absolutely c...
2,Great app. Too many ads. If you saw a video an...
3,"Good app, but there's a glitch that I've had i..."
4,The creator of this app created an algorithm t...


In [None]:
# Coerce every review to str so the string operations in the cleaning step work on all rows.
# Missing values are mapped to '' first: str() on a NaN/None would otherwise produce the
# literal words "nan"/"None", which would then survive cleaning as ordinary tokens.
df['review'] = df['review'].map(lambda value: '' if pd.isna(value) else str(value))

In [None]:
# Text normalisation: turn each raw review into lower-case ASCII words separated by
# single spaces. The steps run in one pass per review instead of five passes over the column.
_NON_ASCII_PATTERN = re.compile(r'[^\x00-\x7f]')
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z]')


def _strip_non_ascii(text):
    """Remove non-ASCII characters from ``text`` without gluing words together.

    Non-ASCII letters are dropped. Every other non-ASCII character (typographic
    quotes, dashes, emoji, ...) becomes a space, because such characters often
    separate words: deleting them outright would turn e.g. a curly-apostrophe
    "there's" into "theres", which no longer matches any stop word.
    """
    return _NON_ASCII_PATTERN.sub(
        lambda match: '' if match.group().isalpha() else ' ',
        text,
    )


def clean_review(text):
    """Return ``text`` as lower-case ASCII letters with single-space separators.

    Steps, in order: strip non-ASCII characters, lower-case, remove URLs,
    replace every remaining non-letter with a space, collapse whitespace.
    """
    text = _strip_non_ascii(text)
    text = text.lower()
    # URLs are removed before punctuation is blanked, while they are still contiguous.
    text = _URL_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub(' ', text)
    # Only letters and spaces remain, so split/join collapses runs and trims both ends.
    return ' '.join(text.split())


df['cleaned_review'] = df['review'].map(clean_review)

In [None]:
df.head()

Unnamed: 0,review,cleaned_review
0,"Ever since the update, there's a weird glitch ...",ever since the update there s a weird glitch w...
1,Don't believe the news!!! You can absolutely c...,don t believe the news you can absolutely cont...
2,Great app. Too many ads. If you saw a video an...,great app too many ads if you saw a video and ...
3,"Good app, but there's a glitch that I've had i...",good app but there s a glitch that i ve had is...
4,The creator of this app created an algorithm t...,the creator of this app created an algorithm t...


In [None]:
# Tokenization: split each cleaned review on whitespace, giving one list of words per row.

df['cleaned_review'] = df['cleaned_review'].str.split()
df.head()

Unnamed: 0,review,cleaned_review
0,"Ever since the update, there's a weird glitch ...","[ever, since, the, update, there, s, a, weird..."
1,Don't believe the news!!! You can absolutely c...,"[don, t, believe, the, news, you, can, absol..."
2,Great app. Too many ads. If you saw a video an...,"[great, app, too, many, ads, if, you, saw, a, ..."
3,"Good app, but there's a glitch that I've had i...","[good, app, but, there, s, a, glitch, that, i,..."
4,The creator of this app created an algorithm t...,"[the, creator, of, this, app, created, an, al..."


In [None]:
# Stop-word filtering: keep only the tokens that are absent from NLTK's English stop-word list.

stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens of one review that are not in ``stop_words``, order preserved."""
    return [token for token in tokens if token not in stop_words]


df['cleaned_review'] = df['cleaned_review'].map(_without_stop_words)
df.head()


Unnamed: 0,review,cleaned_review
0,"Ever since the update, there's a weird glitch ...","[ever, since, update, weird, glitch, tur..."
1,Don't believe the news!!! You can absolutely c...,"[believe, news, absolutely, control, app..."
2,Great app. Too many ads. If you saw a video an...,"[great, app, many, ads, saw, video, lose, lost..."
3,"Good app, but there's a glitch that I've had i...","[good, app, glitch, issues, awhile, solution, ..."
4,The creator of this app created an algorithm t...,"[creator, app, created, algorithm, let, ..."


In [None]:
# Lemmatization

# Load spaCy's small English pipeline. Only the coarse POS tag (token.pos_) of each token
# is read afterwards, so the dependency parser and NER components are disabled: they do
# not feed the tagger, and skipping them makes every nlp(text) call noticeably cheaper.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def lemmatize_with_lemminflect(word, pos):
    """Lemmatize a single word with lemminflect.

    Args:
        word: the token text to lemmatize.
        pos: the part-of-speech tag forwarded to ``getLemma``.

    Returns:
        The first lemma returned by ``getLemma(word, pos)``, or ``word``
        unchanged when ``getLemma`` returns nothing (an empty/falsy result).
    """
    candidates = getLemma(word, pos)
    return candidates[0] if candidates else word

def lemmatize_text_with_lemminflect(text):
    """Lemmatize every token of ``text`` and return them joined by single spaces.

    The text is run through the module-level ``nlp`` pipeline; each resulting
    token's text and ``pos_`` tag are passed to ``lemmatize_with_lemminflect``.
    Token boundaries are the pipeline's, so the output may contain more (or
    fewer) space-separated items than the input had words.
    """
    return ' '.join(
        lemmatize_with_lemminflect(token.text, token.pos_)
        for token in nlp(text)
    )

# Re-join each token list into one string, lemmatize it, and store the resulting string
# (so 'cleaned_review' goes from a list of tokens back to plain text here).
df['cleaned_review'] = [
    lemmatize_text_with_lemminflect(' '.join(tokens))
    for tokens in df['cleaned_review']
]

In [None]:
# Show the frame (raw 'review' next to the final 'cleaned_review') as the cell output.
df

Unnamed: 0,review,cleaned_review
0,"Ever since the update, there's a weird glitch ...",ever since update weird glitch turn subtitle e...
1,Don't believe the news!!! You can absolutely c...,believe news absolutely control app access pho...
2,Great app. Too many ads. If you saw a video an...,great app many ad see video lose lose forever ...
3,"Good app, but there's a glitch that I've had i...",good app glitch issue awhile solution big save...
4,The creator of this app created an algorithm t...,creator app create algorithm let choose well b...
...,...,...
14995,THIS IS THE BEST GAME EVER I PLAY THIS THE MOS...,good game ever play idea make even good people...
14996,PLEASE FIX THIS GLITCH! So sometimes when I jo...,please fix glitch sometimes join game move jum...
14997,i love roblox it is my favorite game to play b...,love roblox favorite game play wish will chang...
14998,This game is pretty cool but there is just one...,game pretty cool one actually there s two prob...


In [None]:
# Write the frame (both 'review' and 'cleaned_review') to CSV without the index column.
# NOTE(review): the destination is a hard-coded absolute path; adjust it when running elsewhere.
PREPPED_DATA_CSV = '/content/preped_data_scrapped.csv'
df.to_csv(PREPPED_DATA_CSV, index=False)