In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from nltk.corpus import words

from nltk.stem import WordNetLemmatizer

from pandarallel import pandarallel
from bs4 import BeautifulSoup

In [None]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [3]:
# Configure pandarallel: 6 worker processes, progress bars enabled.
pandarallel.initialize(nb_workers=6, progress_bar=True)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
# Load the raw query-results export.
# The path is relative to the project root (the same convention the save cells
# in this notebook use) instead of a user-specific absolute path, so the
# notebook can run on another machine.
df = pd.read_csv('data/Source/QueryResults-3.csv')

In [5]:
# Load the previously cleaned dataset.
# Uses the same relative location that the "Saving the cleaned dataset" cell
# writes to, rather than a user-specific absolute path.
df = pd.read_csv('data/cleaned/df_cleaned.csv')

In [6]:
# Display the loaded frame.
df

Unnamed: 0,Title,Body,Tags
0,Giving a unix process exclusive RW access to a...,is there a way to sandbox a linux process into...,<linux><ubuntu><process><sandbox><selinux>
1,automatic repaint when minimizing window,"i have a jframe, with two panels, in one panel...",<java><graphics><jframe><jpanel><paint>
2,Is man-in-the-middle attack a security threat ...,"i am no expert in network security, so pardon ...",<security><ssh><ssh-keys><openssh><man-in-the-...
3,Managing data access in a simple WinForms app,i have a simple winforms data entry app that u...,<c#><winforms><sqlite><datatable><sqlconnection>
4,Render basic HTML view?,i have a basic node.js app that i am trying to...,<javascript><html><node.js><mongodb><express>
...,...,...,...
49995,Bypass Vertica ERROR 3326: Execution time exce...,"using the ssis tool and ole db, we are downloa...",<sql-server><ssas><oledb><sql-server-data-tool...
49996,A conflicting conditional operation is current...,"using s3fs, i am uploading a file to the alrea...",<python><amazon-web-services><amazon-s3><boto3...
49997,Problem in lr_find() in Pytorch fastai course,while following the jupyter notebooks for the ...,<python><machine-learning><deep-learning><pyto...
49998,JSONPatch escape slash '/' from JSONPatch+JSON,i've below json and i wanted to update few fie...,<java><json><rest><json-patch><http-patch>


### 1. Data cleaning

#### 1.1 NaN values

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Count of missing values per column.
df.isna().sum()

In [None]:
# Fraction of missing values per column.
df.isna().mean()

In [None]:
# Heatmap of the boolean missing-value mask (rows x columns).
sns.heatmap(df.isna())

#### 1.2 Duplicated values

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Number of rows whose 'Body' value repeats an earlier row's 'Body'.
df.duplicated(subset='Body').sum()

#### 1.3 Selection of important features

In [None]:
# Column dtypes, kept in the scratch variable `tmp` and displayed.
# NOTE(review): `tmp` is reassigned to unrelated values by later cells.
tmp= df.dtypes
tmp

In [None]:
# Keep only the text columns used by the rest of the notebook.
# .copy() makes `df` an independent frame: later cells assign new values to
# its columns, which on a plain column selection can trigger pandas'
# SettingWithCopyWarning.
cols = ['Title', 'Body', 'Tags']
df = df[cols].copy()
df.head(5)

#### 1.4 Delete HTML and Lower text

In [None]:
# Strip HTML markup from every body and lowercase the remaining text.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so the extracted text could
# differ between environments.
df['Body'] = df['Body'].apply(
    lambda html: BeautifulSoup(html, "html.parser").get_text().lower()
)
df.head(10)

In [7]:
# Lowercase the titles with the vectorized string accessor.
# Unlike apply(lambda x: x.lower()), .str.lower() leaves missing values as NaN
# instead of raising AttributeError on them.
df['Title'] = df['Title'].str.lower()
df.head(10)

Unnamed: 0,Title,Body,Tags
0,giving a unix process exclusive rw access to a...,is there a way to sandbox a linux process into...,<linux><ubuntu><process><sandbox><selinux>
1,automatic repaint when minimizing window,"i have a jframe, with two panels, in one panel...",<java><graphics><jframe><jpanel><paint>
2,is man-in-the-middle attack a security threat ...,"i am no expert in network security, so pardon ...",<security><ssh><ssh-keys><openssh><man-in-the-...
3,managing data access in a simple winforms app,i have a simple winforms data entry app that u...,<c#><winforms><sqlite><datatable><sqlconnection>
4,render basic html view?,i have a basic node.js app that i am trying to...,<javascript><html><node.js><mongodb><express>
5,how to use nested-subfloders routing in nodejs...,i am working on a node project using express. ...,<node.js><express><routes><nested-routes><modu...
6,using generics to process asp.net form request...,using jquery to post values back to an asp.net...,<c#><asp.net><generics><reflection><webforms>
7,simple select(*) queries very slow in apache i...,i'm prototyping apache ignite for use in a new...,<sql><performance><key-value><ignite><in-memor...
8,ms sql server optimizer and varying table and ...,we have a lot of queries for which we append a...,<sql><sql-server><sql-server-2005><tsql><query...
9,how to wait for process child?,i do the usual fork + exec combination:\nint s...,<linux><posix><exec><fork><wait>


In [9]:
# Title of the row labelled 37847, fetched with a single .loc lookup.
df.loc[37847, 'Title']

"why 0/0 is nan but 0/0.00 isn't"

##### ... Saving the cleaned dataset ...

In [8]:
# Write the cleaned frame to CSV without the index column.
df.to_csv("data/cleaned/df_cleaned.csv", index= False)

##### ... cleaned dataset saved ...

#### 1.5 Analysis of word frequency

In [None]:
# Summary statistics restricted to numeric columns.
# NOTE(review): if df only holds the text columns Title/Body/Tags at this
# point there are no numeric columns and this call may raise - confirm.
df.describe(include=np.number)

In [None]:
# Summary statistics (count, unique, top, freq) for object-dtype columns.
df.describe(include=object)

In [None]:
# The ten most frequent values of the 'Tags' column.
df["Tags"].value_counts().head(10)

In [None]:
# Character length of every body, stored in `tmp`, then its distribution plot.
tmp = df.Body.str.len()
sns.displot(tmp)

In [None]:
# Boxplot of `tmp`.
# NOTE(review): relies on `tmp` still being the body lengths computed in the
# previous cell; `tmp` is reused for other values elsewhere in the notebook.
sns.boxplot(tmp)

In [None]:
# Attach the body length as a temporary column.
# The length is recomputed here instead of read from the scratch variable
# `tmp`, which other cells overwrite with unrelated values; this cell no
# longer depends on which cell ran last.
df["_len_txt"] = df.Body.str.len()
df.head(10)

In [None]:
# The ten rows with the smallest '_len_txt'.
df.sort_values('_len_txt').head(10)

In [None]:
# The ten rows with the largest '_len_txt'.
df.sort_values('_len_txt').tail(10)

In [None]:
# Remove the temporary length column.
# errors='ignore' makes the cell safe to re-run (the column is already gone on
# the second run), and rebinding avoids mutating the frame in place.
df = df.drop(columns='_len_txt', errors='ignore')
df.head(5)

### 2 Preliminary text processing analysis

#### 2.1 Tokenization selection

In [None]:
def display_tokens_info(tokens):
    """Print the total and distinct token counts, then the first 1000 tokens."""
    total = len(tokens)
    distinct = len(set(tokens))
    print(f"nb tokens {total}, nb tokens uniques {distinct}")
    preview = tokens[:1000]
    print(preview)

In [None]:
# Draw one body at random as the working example.
# random_state pins the draw so a fresh "Restart & Run All" picks the same
# document and the tokenizer comparisons below are reproducible.
doc = df.Body.sample(1, random_state=42)
doc

In [None]:
# Unwrap the one-row Series into its single value.
# NOTE: `doc` is rebound from a Series to that value, so this cell cannot be
# run twice in a row.
doc = doc.iloc[0]
doc

In [None]:
# Print tags, title and body of the row labelled 27287.
row_label = 27287
txt_1 = f"tag : {df.loc[row_label, 'Tags']}\n"
txt_2 = f"title : {df.loc[row_label, 'Title']}\n"
print(txt_1 + txt_2 + df.loc[row_label, 'Body'] + "..." + "\n\n")

In [None]:
# Candidate 1: tokenize the example document with nltk's word_tokenize.
tokens_1 = word_tokenize(doc)
display_tokens_info(tokens_1)

In [None]:
# Candidate 2: tokenize the example document with nltk's wordpunct_tokenize.
tokens_2 = wordpunct_tokenize(doc)
display_tokens_info(tokens_2)

In [None]:
# English stop words as a set for fast membership tests.
# The corpus fileid is lowercase 'english'; 'English' only resolves on
# case-insensitive filesystems and fails elsewhere.
stop_words = set(nltk.corpus.stopwords.words('english'))

In [None]:
# Drop stop words from the word_tokenize result.
tokens_1 = list(filter(lambda tok: tok not in stop_words, tokens_1))
display_tokens_info(tokens_1)

In [None]:
# Drop stop words from the wordpunct_tokenize result.
tokens_2 = list(filter(lambda tok: tok not in stop_words, tokens_2))
display_tokens_info(tokens_2)

In [None]:
# Candidate 3: regex tokenizer that keeps runs of word characters (\w+).
tokenizer = nltk.RegexpTokenizer(r"\w+")
tokens_3 = tokenizer.tokenize(doc)
display_tokens_info(tokens_3)

In [None]:
# Drop stop words from the regex tokenizer result.
tokens_3 = list(filter(lambda tok: tok not in stop_words, tokens_3))
display_tokens_info(tokens_3)

#### 2.2 First text processing function / test ...

In [None]:
def process_text(doc,
                 rejoin=False):
    """Tokenize `doc` on word characters and drop stop words.

    Parameters
    ----------
    doc : str
        Text to process.
    rejoin : bool, default False
        When True, return the kept tokens joined by single spaces instead of
        a list.

    Returns
    -------
    list of str, or str when `rejoin` is True.

    Relies on the notebook-level `stop_words` collection.
    """
    # Split into \w+ runs
    words_found = nltk.RegexpTokenizer(r'\w+').tokenize(doc)

    # Keep everything that is not a stop word
    kept = list(filter(lambda token: token not in stop_words, words_found))

    return " ".join(kept) if rejoin else kept

In [None]:
# Run the first processing function on the example document.
tokens_4 = process_text(doc)
display_tokens_info(tokens_4)

#### 2.3 Generalization of preliminary text processing to full corpus of answers

In [None]:
# Concatenate every body into one corpus string.
# Documents are joined with a space: joining with "" glued the last word of
# one document to the first word of the next, creating bogus tokens and
# skewing the frequency counts computed below.
raw_corpus = " ".join(df.Body.values)
raw_corpus[:1000]

In [None]:
# Tokenize the whole corpus string and drop stop words; keep the token list.
corpus = process_text(raw_corpus, rejoin=False)
display_tokens_info(corpus)

#### 2.4 Preliminary Analysis of Frequency

In [None]:
# Occurrence count of every token (most frequent first); show the top ten.
tmp = pd.Series(corpus).value_counts()
tmp.head(10)

In [None]:
# Last ten entries of the token counts.
tmp.tail(10)

In [None]:
# Boxplot of the counts, restricted to tokens counted fewer than 50 times.
sns.boxplot(tmp[tmp < 50])

In [None]:
# Tokens whose count is exactly 1, as a plain list, and how many there are.
appears_once = tmp == 1
list_unique_words = tmp[appears_once].index.tolist()
len(list_unique_words)

##### ... saving the words appearing only once in the corpus ...

In [None]:
# Wrap the once-only words in a one-column frame, write it to CSV, display it.
list_unique_words_df = pd.DataFrame({"words" : list_unique_words})
list_unique_words_df.to_csv("data/cleaned/unique_words.csv", index=False)
list_unique_words_df

In [None]:
# Tokens counted fewer than 5 times, as a plain list.
below_five = tmp < 5
list_min_5_words = tmp[below_five].index.tolist()

In [None]:
# Number of tokens in list_min_5_words.
len(list_min_5_words )

In [None]:
# Show 20 randomly chosen rare words.
# list_min_5_words is a plain Python list, which has no .sample() method
# (the original call raised AttributeError); wrap it in a Series to sample.
pd.Series(list_min_5_words).sample(20)

##### ... saving the words appearing fewer than 5 times in the corpus ...

In [None]:
# Wrap list_min_5_words in a one-column frame and write it to CSV.
list_min_5_words_df = pd.DataFrame({"words" : list_min_5_words})
list_min_5_words_df.to_csv("data/cleaned/min_5_words.csv", index=False)

In [None]:
# Tokens counted fewer than 10 times, as a plain list, and how many there are.
below_ten = tmp < 10
list_min_10_words = tmp[below_ten].index.tolist()
len(list_min_10_words)

##### ... saving the words appearing fewer than 10 times in the corpus ...

In [None]:
# Wrap list_min_10_words in a one-column frame and write it to CSV.
list_min_10_words_df = pd.DataFrame({"words" : list_min_10_words})
list_min_10_words_df.to_csv("data/cleaned/min_10_words.csv", index=False)

In [None]:
def process_text_2(doc,
                   rejoin=False,
                   list_rare_words=None,
                   min_len_word=2,
                   force_is_alpha=False):
    """Tokenize `doc` and filter out stop words, rare words and short tokens.

    Parameters
    ----------
    doc : str
        Text to process.
    rejoin : bool, default False
        When True, return the kept tokens joined by single spaces.
    list_rare_words : iterable of str, optional
        Tokens to discard. None or an empty collection disables this filter.
    min_len_word : int, default 2
        Minimum token length (inclusive) for a token to be kept.
    force_is_alpha : bool, default False
        When True, keep only tokens for which str.isalpha() is true.

    Returns
    -------
    list of str, or str when `rejoin` is True.

    Relies on the notebook-level `stop_words` collection.
    """
    # Membership tests against a list are linear; with a large rare-word list
    # and a whole-corpus token stream that is quadratic. A set gives the same
    # result with constant-time lookups.
    rare_words = set(list_rare_words) if list_rare_words else set()

    # tokenizer
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    raw_tokens_list = tokenizer.tokenize(doc)

    # One pass applying every filter in the original order:
    # stop words -> rare words -> minimum length -> optional alpha-only.
    kept_tokens = [
        w for w in raw_tokens_list
        if w not in stop_words
        and w not in rare_words
        and len(w) >= min_len_word
        and (not force_is_alpha or w.isalpha())
    ]

    if rejoin:
        return " ".join(kept_tokens)

    return kept_tokens

In [None]:
# Token statistics of the current corpus.
display_tokens_info(corpus)

In [None]:
# Number of distinct tokens in the corpus.
len(set(corpus))

In [None]:
# Re-process the raw corpus, this time discarding the once-only words.
corpus = process_text_2(
    raw_corpus,
    rejoin=False,
    list_rare_words=list_unique_words,
)
display_tokens_info(corpus)

In [None]:
# Display the regex-tokenized, stop-word-filtered example tokens.
tokens_3

In [None]:
# Stem every example token with the Porter stemmer and print the result.
trans_stemmer = nltk.PorterStemmer()
trans_text_stemmer = list(map(trans_stemmer.stem, tokens_3))
print(trans_text_stemmer)

In [None]:
# Number of stemmed tokens.
len(trans_text_stemmer)

In [None]:
# Number of distinct stems.
len(set(trans_text_stemmer))

In [None]:
# Lemmatize every example token with the WordNet lemmatizer and print the
# result. The original cell instantiated PorterStemmer here, so the
# "lemma" output was just a second copy of the stemmer output and the
# stem-vs-lemma comparison was meaningless.
trans_lemma = WordNetLemmatizer()
trans_text_lemma = [trans_lemma.lemmatize(i) for i in tokens_3]
print(trans_text_lemma)

In [None]:
# Number of tokens in trans_text_lemma.
len(trans_text_lemma)

In [None]:
# Number of distinct values in trans_text_lemma.
len(set(trans_text_lemma))

In [None]:
def process_text_3(doc,
                    rejoin=False,
                    list_rare_words = None,
                    min_len_word = 2,
                    force_is_alpha= True,
                    lemm_or_stem = 'lem'):
    """Tokenize, filter and normalize `doc`.

    Parameters
    ----------
    doc : str
        Text to process.
    rejoin : bool, default False
        When True, return the resulting tokens joined by single spaces.
    list_rare_words : iterable of str, optional
        Tokens to discard. None or an empty collection disables this filter.
    min_len_word : int, default 2
        Minimum token length (inclusive) for a token to be kept.
    force_is_alpha : bool, default True
        When True, keep only tokens for which str.isalpha() is true.
    lemm_or_stem : str, default 'lem'
        'lem' lemmatizes with WordNetLemmatizer; any other value stems with
        PorterStemmer.

    Returns
    -------
    list of str, or str when `rejoin` is True.

    Relies on the notebook-level `stop_words` collection.
    """
    # Set lookups are constant-time; the original list membership test made
    # the rare-word filter linear per token.
    rare_words = set(list_rare_words) if list_rare_words else set()

    # tokenizer
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    raw_tokens_list = tokenizer.tokenize(doc)

    # One pass applying every filter in the original order:
    # stop words -> rare words -> minimum length -> optional alpha-only.
    # The per-stage debug prints were removed: this function is applied once
    # per document, so they flooded the output with one line per stage per row.
    alpha_tokens = [
        w for w in raw_tokens_list
        if w not in stop_words
        and w not in rare_words
        and len(w) >= min_len_word
        and (not force_is_alpha or w.isalpha())
    ]

    # Normalize each surviving token
    if lemm_or_stem == 'lem':
        trans = WordNetLemmatizer()
        trans_text = [trans.lemmatize(i) for i in alpha_tokens]
    else:
        trans = nltk.PorterStemmer()
        trans_text = [trans.stem(i) for i in alpha_tokens]

    if rejoin:
        return " ".join(trans_text)

    return trans_text

In [None]:
def final_clean(doc):
    """Run process_text_3 on `doc` with the notebook's final settings.

    Discards the words in the notebook-level `list_unique_words`, keeps
    alphabetic tokens of length >= 2, and returns the token list.
    """
    return process_text_3(
        doc,
        rejoin=False,
        list_rare_words=list_unique_words,
        min_len_word=2,
        force_is_alpha=True,
    )

In [None]:
# Clean every body in parallel and store the token lists in '_clean_text'.
df['_clean_text'] = df['Body'].parallel_apply(final_clean)

In [None]:
# Process the full corpus with the final pipeline.
# The input must be the raw corpus *string*: `corpus` is already a list of
# tokens from the earlier processing cells, and passing a list to the regex
# tokenizer inside process_text_3 raises TypeError.
corpus = process_text_3(raw_corpus, list_rare_words=list_unique_words)

In [None]:
# Token statistics of the corpus.
display_tokens_info(corpus)