In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import re 
import spacy

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file below the Kaggle input directory.
# os.walk yields nothing when the directory does not exist, so outside Kaggle
# this loop simply prints nothing instead of failing.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split of the game-review dataset; every later cell reads from `df`.
df = pd.read_csv("../input/game-review-dataset/train_gr/train.csv")

# Preview five random rows. The fixed seed makes the preview identical on every
# "Restart & Run All" instead of changing with each execution.
df.sample(5, random_state=42)

## Only the available training dataset is used here, to keep the analysis simple; the reviews will be classified later.

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

#### The Year column has some null values, but that is not important for now.

In [None]:
# Number of reviews for each distinct user_suggestion value.
df['user_suggestion'].value_counts()

## Number of characters 

In [None]:
# Length of every review in characters (NaN for a missing review).
class_len = df['user_review'].str.len()

plt.figure(figsize=(16,8))
# Missing lengths are dropped only for plotting: a NaN in the data makes the
# histogram's automatic bin range non-finite.
plt.hist(class_len.dropna())
plt.title('Characters in Reviews')
# Axis labels let the figure be read on its own when the notebook is skimmed.
plt.xlabel('Characters per review')
plt.ylabel('Number of reviews')

plt.show()

## Number of words in a review 

In [None]:
# Number of whitespace-separated words in every review. `.str.len()` propagates
# NaN for a missing review, whereas calling len() on it through map() raises
# TypeError.
class_len = df['user_review'].str.split().str.len()

plt.figure(figsize=(16,8))
# Missing counts are dropped only for plotting so the bin range stays finite.
plt.hist(class_len.dropna())
plt.title('Words in a review')
# Axis labels let the figure be read on its own when the notebook is skimmed.
plt.xlabel('Words per review')
plt.ylabel('Number of reviews')

plt.show()

## Common stopwords, punctuation marks and words in reviews

In [None]:
import nltk 
from nltk.corpus import stopwords 
import string
import collections

# English stopwords from the NLTK corpus, shared by the EDA and cleaning cells below.
# NOTE(review): this is a plain list, so every `word in STOPWORDS` test scans it
# linearly. It also presumably needs the NLTK "stopwords" corpus to be present
# locally (nltk.download('stopwords')) - confirm for non-Kaggle environments.
STOPWORDS = stopwords.words('english')

In [None]:
# Show the full stopword list that the filters below compare tokens against.
print(STOPWORDS)

In [None]:
def create_corpus(reviews=None):
    """Flatten review texts into one list of lowercase tokens.

    Parameters
    ----------
    reviews : iterable of str, optional
        Texts to tokenise. When omitted, ``df['user_review']`` is used, so
        existing ``create_corpus()`` calls behave as before.

    Returns
    -------
    list of str
        Every whitespace-separated token of every text, lowercased, in
        document order. Punctuation stays attached to its token.

    Notes
    -----
    Entries that are not strings (for example NaN for a missing review) are
    skipped. Without that check, iterating such an entry raises ``TypeError``.
    """
    if reviews is None:
        reviews = df['user_review']

    return [
        word.lower()
        for text in reviews
        if isinstance(text, str)
        for word in text.split()
    ]

## First, let's look at the top 20 most common stopwords 

In [None]:
corpus = create_corpus()

# A set gives constant-time membership tests; checking against the STOPWORDS
# list would rescan the whole list for every token in the corpus.
stopword_set = set(STOPWORDS)
# Counter behaves like the former defaultdict(int): a missing key reads as 0.
dic = collections.Counter(word for word in corpus if word in stopword_set)

# most_common(20) returns the 20 largest counts already sorted in descending order.
top_stopwords = dic.most_common(20)
# Guard the unpacking: zip(*[]) produces nothing and would raise ValueError.
x, y = zip(*top_stopwords) if top_stopwords else ((), ())

plt.figure(figsize=(16, 8))
plt.bar(x, y)
# Title and labels let the figure be read on its own.
plt.title('Top 20 most common stopwords')
plt.xlabel('Stopword')
plt.ylabel('Occurrences')

plt.show()

## Now, the top 20 most common punctuation marks

In [None]:
corpus = create_corpus()

punctuation_chars = set(string.punctuation)
# Count every punctuation character found inside any token. Counter behaves
# like the former defaultdict(int): a missing key reads as 0.
dic = collections.Counter(
    char
    for word in corpus
    for char in word
    if char in punctuation_chars
)

# most_common(20) returns the 20 largest counts already sorted in descending order.
top_punctuation = dic.most_common(20)
# Guard the unpacking: zip(*[]) produces nothing and would raise ValueError.
x, y = zip(*top_punctuation) if top_punctuation else ((), ())

plt.figure(figsize=(16,8))
plt.bar(x, y)
# Title and labels let the figure be read on its own.
plt.title('Top 20 most common punctuation marks')
plt.xlabel('Punctuation mark')
plt.ylabel('Occurrences')

plt.show()

## Next, let's look at the most common words in reviews (stopwords excluded)

In [None]:
corpus = create_corpus()
counter = collections.Counter(corpus)
most_common = counter.most_common()

# Filter stopwords BEFORE truncating. Slicing the top 50 tokens first and then
# discarding stopwords leaves only the handful of non-stopwords that happen to
# rank among them, so the number of plotted words varied with the data.
stopword_set = set(STOPWORDS)
top_words = [
    (word, count)
    for word, count in most_common
    if word not in stopword_set
][:50]

# x holds the words and y their counts, both in descending order of frequency.
x = [word for word, count in top_words]
y = [count for word, count in top_words]

In [None]:
import seaborn as sns 

# Horizontal bar chart: counts (y) run along the horizontal axis, one row per
# word (x). Both lists come from the preceding cell.
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(x=y, y=x, ax=ax)

## Wordcloud 

In [None]:
from wordcloud import WordCloud

# WordCloud.generate expects one string, so join the corpus tokens back together.
all_words = ' '.join(create_corpus())

# The `stopwords` parameter is documented as a set of strings; pass a set
# rather than relying on the list being accepted.
wordcloud = WordCloud(max_font_size=50, background_color='white', stopwords=set(STOPWORDS)).generate(all_words)

plt.figure(figsize=(15,5))
plt.imshow(wordcloud, interpolation='bilinear')
# The axes would only show pixel coordinates of the rendered image; hide them.
plt.axis('off')

plt.show()

## Now that we've analyzed the reviews, let's pre-process the texts 

### Steps:
* Apply lowercase 
* Remove punctuation
* Remove numbers
* Remove stopwords
* Remove white spaces
* Apply lemmatization 

In [None]:
## APPLY LOWERCASE ON REVIEWS 
# The vectorised .str.lower() leaves a missing review as NaN; calling
# text.lower() through apply() raises AttributeError on such a value.
df['review_cleaned'] = df['user_review'].str.lower()

In [None]:
## REMOVE PUNCTUATION 
def remove_punctuation(sentence):
    """Return ``sentence`` (coerced to ``str``) without any ``string.punctuation`` character.

    Characters are deleted, not replaced by a space, so tokens joined only by
    punctuation merge: ``"a.b"`` becomes ``"ab"``.
    """
    # The third argument of maketrans lists the characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    return str(sentence).translate(deletion_table)

# Strip punctuation from every review; the function is passed directly, no lambda wrapper needed.
df['review_cleaned'] = df['review_cleaned'].apply(remove_punctuation)

In [None]:
## REMOVE NUMBERS 
def remove_numbers(sentence):
    """Return ``sentence`` with every run of digits deleted.

    Nothing is inserted in place of the digits, so the whitespace that
    surrounded a number is left as it was.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', sentence)

# Delete digits from every review; the function is passed directly, no lambda wrapper needed.
df['review_cleaned'] = df['review_cleaned'].apply(remove_numbers)

In [None]:
## REMOVE STOPWORDS 
def remove_stopwords(sentence):
    """Drop stopwords from a whitespace-tokenised sentence.

    Parameters
    ----------
    sentence : object
        Text to filter; coerced with ``str()`` before splitting.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Notes
    -----
    A token is removed when it equals an entry of ``STOPWORDS`` either
    verbatim or after that entry has had its punctuation stripped. The second
    form is required because this notebook removes punctuation *before*
    stopwords: "don't" reaches this function as "dont" and would otherwise
    never match the apostrophe form in the list.

    Trade-off: a stripped contraction can coincide with an ordinary word
    (an entry such as "we'll" would also match "well"). After punctuation
    removal the two are indistinguishable in the text anyway.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Rebuilt on every call so that later edits to STOPWORDS are honoured. The
    # set makes each membership test O(1) instead of a scan over the list.
    lookup = set(STOPWORDS)
    lookup.update(entry.translate(strip_punctuation) for entry in STOPWORDS)

    return ' '.join(word for word in str(sentence).split() if word not in lookup)
    
# Filter stopwords out of every review; the function is passed directly, no lambda wrapper needed.
df['review_cleaned'] = df['review_cleaned'].apply(remove_stopwords)

In [None]:
## REMOVE WHITE SPACES 
def remove_spaces(sentence):
    """Collapse every whitespace run in ``sentence`` to one space and trim both ends."""
    whitespace_runs = re.compile(r'\s+')
    collapsed = whitespace_runs.sub(' ', sentence)
    return collapsed.strip()

# Normalise whitespace in every review; the function is passed directly, no lambda wrapper needed.
df['review_cleaned'] = df['review_cleaned'].apply(remove_spaces)

In [None]:
## APPLY LEMMATIZATION 
# Load spaCy's small English pipeline with the 'parser' and 'ner' components
# disabled; the lemmatisation below only reads token.lemma_ and token.is_alpha.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatizer_doc(sentence):
    """Return the lemmas of the alphabetic tokens in ``sentence``, joined by spaces.

    The text is run through the module-level ``nlp`` pipeline; tokens whose
    ``is_alpha`` flag is false are dropped.
    """
    return ' '.join(token.lemma_ for token in nlp(sentence) if token.is_alpha)

# Lemmatise every review; the function is passed directly, no lambda wrapper needed.
df['review_cleaned'] = df['review_cleaned'].apply(lemmatizer_doc)

## Compare the original review with the same review after pre-processing

In [None]:
# Show the first row's raw text next to its cleaned counterpart.
first_row = df.iloc[0]
print('Original review\n', first_row['user_review'])
print('\nReview clear\n', first_row['review_cleaned'])