In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.decomposition import PCA
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

# Fetch the NLTK resources used further down: the stop-word list, WordNet
# (for WordNetLemmatizer) and the Punkt tokenizer models that word_tokenize
# relies on. Without Punkt, a fresh environment fails with LookupError the
# first time word_tokenize is called.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt' —
# confirm which resource the installed version expects.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/mehdi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mehdi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Pickle saving helpers

In [5]:
PATH = './data/ruddit.csv'
TRAIN_RATIO = 0.75
# NOTE(review): scikit-learn interprets an *integer* test_size as an absolute
# number of rows, so the value 1 sends exactly one row to the validation split
# (the cell output reports a validation length of 1). If a proportional
# test/validation split is wanted this has to be a float in (0, 1) — confirm
# the intent before changing it.
TEST_VAL_RATIO = 1

dataset = pd.read_csv(PATH)

# Stage 1: carve the training rows out of the full dataset; the remainder is a
# held-out pool shared by the test and validation splits.
x_train, x_test_valid, y_train, y_test_valid = train_test_split(
    dataset["comment_text"],
    dataset['offensiveness_score'],
    train_size=TRAIN_RATIO,
    random_state=0,
)

# Stage 2: divide the held-out pool. The first pair returned by
# train_test_split becomes the test split, the `test_size` portion becomes the
# validation split.
x_test, x_valid, y_test, y_valid = train_test_split(
    x_test_valid,
    y_test_valid,
    test_size=TEST_VAL_RATIO,
    random_state=0,
)

# Assemble one frame per split: positional 0..n-1 index, comment text under
# 'text', and the score stored as float32 under 'score'.
train_raw, test_raw, valid_raw = (
    pd.DataFrame({
        'text': texts.reset_index(drop=True),
        'score': scores.reset_index(drop=True).astype('float32'),
    })
    for texts, scores in (
        (x_train, y_train),
        (x_test, y_test),
        (x_valid, y_valid),
    )
)

# The intermediate Series are no longer needed once the frames exist.
del x_train, x_test, x_valid, y_train, y_test, y_valid, x_test_valid, y_test_valid
len(train_raw), len(test_raw), len(valid_raw)

(4225, 1408, 1)

In [6]:
# Shared text-cleaning resources, created once and read by preprocess_text:
# a WordNet lemmatizer and the English stop-word list held as a set.
wl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Reduce a raw comment to a space-separated string of lemmatized words.

    Steps, in order:
      1. every character that is not an ASCII letter becomes a space;
      2. the text is lower-cased and split on whitespace;
      3. words found in the module-level ``stop_words`` set are discarded;
      4. each remaining word goes through the module-level ``wl`` lemmatizer.

    Args:
        text: The raw comment string.

    Returns:
        The surviving lemmas joined by single spaces. The result is an empty
        string when no word survives the filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    lemmas = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        lemmas.append(wl.lemmatize(token))
    return ' '.join(lemmas)

def clean_dataset(ds):
    """Run ``preprocess_text`` over the ``'text'`` column of ``ds``.

    The column is overwritten on the frame that was passed in, so callers who
    need to keep the original text must pass a copy.

    Args:
        ds: DataFrame with a ``'text'`` column of strings.

    Returns:
        The same DataFrame object, with its ``'text'`` column replaced by the
        cleaned strings.
    """
    cleaned_text = ds['text'].apply(preprocess_text)
    ds['text'] = cleaned_text
    return ds

# clean_dataset overwrites the 'text' column of the frame it receives, so each
# raw split is copied first and the *_raw frames keep their original text.
train, test, valid = (
    clean_dataset(raw_split.copy())
    for raw_split in (train_raw, test_raw, valid_raw)
)

In [17]:
def dataset_statistics(dataset_df, labels=False):
    """Print token-count statistics for ``'text'`` and a summary of ``'score'``.

    Each value of the ``'text'`` column is tokenized with ``word_tokenize``.
    The function prints the number of documents and the average, maximum and
    minimum token counts. When a ``'score'`` column is present it also prints
    ``dataset_df['score'].describe()``.

    Args:
        dataset_df: DataFrame with a ``'text'`` column of strings and,
            optionally, a ``'score'`` column.
        labels: When true and a ``'score'`` column exists, additionally print
            the average number of labels per document. A sized, non-string
            score (list, tuple, ...) counts as ``len(score)`` labels; any
            other value counts as one label.

    Returns:
        None. All results are written to standard output.
    """
    # Tokenize once per document; every statistic below derives from this list.
    token_counts = [len(word_tokenize(text)) for text in dataset_df['text']]
    n_docs = len(dataset_df)
    has_score = 'score' in dataset_df.columns

    print("Number of documents: ", n_docs)
    # An empty frame reports zeros rather than dividing by zero or leaking a
    # sentinel value as the minimum.
    print("Average number of tokens: ", sum(token_counts) / n_docs if n_docs else 0.0)
    print("Max number of tokens: ", max(token_counts, default=0))
    print("Min number of tokens: ", min(token_counts, default=0))

    if labels and has_score:
        # Scalar scores have no len(); treat them as a single label so that
        # labels=True works for float scores as well as for label collections.
        label_counts = [
            len(value) if hasattr(value, '__len__') and not isinstance(value, str) else 1
            for value in dataset_df['score']
        ]
        print("Average number of labels: ", sum(label_counts) / n_docs if n_docs else 0.0)

    if has_score:
        print("score's stats:")
        print(dataset_df['score'].describe())

In [18]:
# Token-count and score statistics for the cleaned training split.
dataset_statistics(train)

Number of documents:  4225
Average number of tokens:  17.59360946745562
Max number of tokens:  83
Min number of tokens:  0
score's stats:
count    4225.000000
mean       -0.031772
std         0.329498
min        -0.889000
25%        -0.261000
50%        -0.062000
75%         0.146000
max         0.938000
Name: score, dtype: float64


In [19]:
# Token-count and score statistics for the cleaned test split.
dataset_statistics(test)

Number of documents:  1408
Average number of tokens:  17.28409090909091
Max number of tokens:  80
Min number of tokens:  1
score's stats:
count    1408.000000
mean       -0.024315
std         0.343100
min        -0.875000
25%        -0.271000
50%        -0.062000
75%         0.188000
max         0.979000
Name: score, dtype: float64
