# P5 : Catégorisez automatiquement des questions - Nettoyage et exploration des données

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\n_a_e\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\n_a_e\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Importation et format des données 

In [2]:
data = pd.read_csv('QueryResults.csv')

In [3]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount
0,React-Native: Download Image from Firebase Sto...,<p><strong>Pre-Informations:</strong></p>\n\n<...,<javascript><image><firebase><react-native><fi...,39385219,5,12803,1,3
1,How to set Entity Framework Core migration tim...,<p>I'm using the latest (1.0.0) version of EF ...,<c#><entity-framework><entity-framework-core>,39006847,39,36435,3,6
2,gtk.StatusIcon and gtk.Menu on Windows,<p>I have a crossplatform app that has a gtk.S...,<gtk><pygtk>,1138891,4,1390,1,3
3,How to create shared package.json for multiple...,<p>I am building my app using separated micro ...,<node.js><npm>,39385269,14,12103,2,3
4,How to add draggable scrollbar to scrollview/l...,<p>I have a listView that uses a customListAda...,<android><android-listview><scrollbar>,18254370,9,6877,1,1


In [4]:
data.shape

(50000, 8)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          50000 non-null  object
 1   Body           50000 non-null  object
 2   Tags           50000 non-null  object
 3   Id             50000 non-null  int64 
 4   Score          50000 non-null  int64 
 5   ViewCount      50000 non-null  int64 
 6   FavoriteCount  50000 non-null  int64 
 7   AnswerCount    50000 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 3.1+ MB


In [6]:
data.isna().mean()

Title            0.0
Body             0.0
Tags             0.0
Id               0.0
Score            0.0
ViewCount        0.0
FavoriteCount    0.0
AnswerCount      0.0
dtype: float64

In [7]:
data['Title'] = data['Title'].astype(str)

In [8]:
data['Body'] = data['Body'].astype(str)

In [9]:
data['Tags'] = data['Tags'].astype(str)

In [10]:
data['Title'][:10]

0    React-Native: Download Image from Firebase Sto...
1    How to set Entity Framework Core migration tim...
2               gtk.StatusIcon and gtk.Menu on Windows
3    How to create shared package.json for multiple...
4    How to add draggable scrollbar to scrollview/l...
5     How to set order of cipher suites for Jetty SSL?
6    The TCP/IP connection to the host localhost, p...
7    Workaround for lack of line width on Windows w...
8    View doesn't start from bottom of status bar i...
9      show process's full command line in powershell?
Name: Title, dtype: object

In [11]:
data['Body'][:10]

0    <p><strong>Pre-Informations:</strong></p>\n\n<...
1    <p>I'm using the latest (1.0.0) version of EF ...
2    <p>I have a crossplatform app that has a gtk.S...
3    <p>I am building my app using separated micro ...
4    <p>I have a listView that uses a customListAda...
5    <p>For serving my application I use Apache or ...
6    <p>I'm getting this 2 errors repeatedly when I...
7    <p>I'm trying to draw a rotating 3D coordinate...
8    <p>When updating my app for iOS7, the UI is ap...
9    <p>When I run this command</p>\n\n<p><code>PS ...
Name: Body, dtype: object

In [12]:
data['Tags'][:10]

0    <javascript><image><firebase><react-native><fi...
1        <c#><entity-framework><entity-framework-core>
2                                         <gtk><pygtk>
3                                       <node.js><npm>
4               <android><android-listview><scrollbar>
5                                   <java><ssl><jetty>
6                    <java><sql-server><eclipse><jdbc>
7                        <javascript><three.js><webgl>
8                     <iphone><ios><objective-c><ios7>
9                                         <powershell>
Name: Tags, dtype: object

## Nettoyage des données

### Etapes communes

#### Suppression des duplicats

In [14]:
print(data.shape)
data.drop_duplicates(subset=['Title'], inplace =True)
data.drop_duplicates(subset=['Body'], inplace =True)
print(data.shape)

(50000, 8)
(49997, 8)


In [29]:
data_c = data.copy()

In [30]:
data_c.shape

(49997, 8)

#### Suppression des balises html

In [31]:
def sup_balise(sentence) :
    return BeautifulSoup(sentence).get_text()

In [32]:
data_c["Title"] = data_c["Title"].apply(sup_balise)
data_c["Title"].head()

0    React-Native: Download Image from Firebase Sto...
1    How to set Entity Framework Core migration tim...
2               gtk.StatusIcon and gtk.Menu on Windows
3    How to create shared package.json for multiple...
4    How to add draggable scrollbar to scrollview/l...
Name: Title, dtype: object

In [33]:
data_c["Body"] = data_c["Body"].apply(sup_balise)
data_c["Body"].head()

0    Pre-Informations:\nI still have a Firebase pro...
1    I'm using the latest (1.0.0) version of EF Cor...
2    I have a crossplatform app that has a gtk.Stat...
3    I am building my app using separated micro ser...
4    I have a listView that uses a customListAdapte...
Name: Body, dtype: object

#### Netoyage des tags

In [37]:
data_c['Tags'] = data_c.apply(lambda r :r['Tags'].replace('<', ' ').replace('>', ' ') , axis=1)

In [38]:
data_c['Tags'].head()

0     javascript  image  firebase  react-native  fi...
1         c#  entity-framework  entity-framework-core 
2                                          gtk  pygtk 
3                                        node.js  npm 
4                android  android-listview  scrollbar 
Name: Tags, dtype: object

### Tokénisation

In [101]:
def tokenizer_fct(sentence) :
    sentence_clean = sentence.replace('-', ' ').replace('<', ' ').replace('>', ' ').replace(
        '.', ' ').replace('#', ' ').replace('/', ' ').replace("'", '')
    word_tokens = word_tokenize(sentence_clean)
    return word_tokens

In [102]:
data["Title_t"] = data["Title"].apply(tokenizer_fct)
data["Title_t"].head()

0    [C, Generics, wont, allow, Delegate, Type, Con...
1    [How, do, you, get, a, directory, listing, in,...
2                [Returning, DataTables, in, WCF, NET]
3    [How, to, create, a, DataFrame, from, a, text,...
4    [Whats, the, difference, between, struct, and,...
Name: Title_t, dtype: object

In [103]:
data["Body_t"] = data["Body"].apply(tokenizer_fct)
data["Body_t"].head()

0    [p, Is, it, possible, to, define, a, class, in...
1    [p, How, do, you, scan, a, directory, for, fol...
2    [p, I, have, a, WCF, service, from, which, I, ...
3    [p, I, have, a, text, file, on, HDFS, and, I, ...
4    [p, Whats, the, difference, between, struct, a...
Name: Body_t, dtype: object

In [104]:
data["Tags_t"] = data["Tags"].apply(tokenizer_fct)
data["Tags_t"].head()

0        [c, generics, events, delegates, constraints]
1    [c, file, directory, cross, platform, common, ...
2              [c, net, wcf, web, services, datatable]
3    [scala, apache, spark, dataframe, apache, spar...
4    [net, class, struct, value, type, reference, t...
Name: Tags_t, dtype: object

### Normalisation

#### Suppression des mots vides

In [105]:
stop_w = list(set(stopwords.words('english'))) + ['[', ']', ',', '.', ':', '?', '(', ')', '/', '<', '>', '#', '-', '_', "''"]

def stop_word_filter(list_words) :
    filtered_w1 = [w for w in list_words if not w in stop_w]
    filtered_w2 = [w for w in filtered_w1 if len(w) > 2]
    return filtered_w2

In [106]:
data["Title_w"] = data["Title_t"].apply(stop_word_filter)
data["Title_w"].head()

0    [Generics, wont, allow, Delegate, Type, Constr...
1                       [How, get, directory, listing]
2                    [Returning, DataTables, WCF, NET]
3          [How, create, DataFrame, text, file, Spark]
4              [Whats, difference, struct, class, NET]
Name: Title_w, dtype: object

In [107]:
data["Body_w"] = data["Body_t"].apply(stop_word_filter)
data["Body_w"].head()

0    [possible, define, class, pre, code, class, Ge...
1    [How, scan, directory, folders, files, needs, ...
2    [WCF, service, want, return, DataTable, know, ...
3    [text, file, HDFS, want, convert, Data, Frame,...
4              [Whats, difference, struct, class, NET]
Name: Body_w, dtype: object

In [110]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,Title_t,Body_t,Tags_t,Title_w,Body_w
0,C# Generics won't allow Delegate Type Constraints,<p>Is it possible to define a class in C# such...,<c#><generics><events><delegates><constraints>,191940,81,29837,12,8,"[C, Generics, wont, allow, Delegate, Type, Con...","[p, Is, it, possible, to, define, a, class, in...","[c, generics, events, delegates, constraints]","[Generics, wont, allow, Delegate, Type, Constr...","[possible, define, class, pre, code, class, Ge..."
1,How do you get a directory listing in C?,<p>How do you scan a directory for folders and...,<c><file><directory><cross-platform><common-ta...,12489,69,139075,22,9,"[How, do, you, get, a, directory, listing, in,...","[p, How, do, you, scan, a, directory, for, fol...","[c, file, directory, cross, platform, common, ...","[How, get, directory, listing]","[How, scan, directory, folders, files, needs, ..."
2,Returning DataTables in WCF/.NET,<p>I have a WCF service from which I want to r...,<c#><.net><wcf><web-services><datatable>,12702,51,56460,12,8,"[Returning, DataTables, in, WCF, NET]","[p, I, have, a, WCF, service, from, which, I, ...","[c, net, wcf, web, services, datatable]","[Returning, DataTables, WCF, NET]","[WCF, service, want, return, DataTable, know, ..."
3,How to create a DataFrame from a text file in ...,<p>I have a text file on HDFS and I want to co...,<scala><apache-spark><dataframe><apache-spark-...,36766322,21,168246,12,8,"[How, to, create, a, DataFrame, from, a, text,...","[p, I, have, a, text, file, on, HDFS, and, I, ...","[scala, apache, spark, dataframe, apache, spar...","[How, create, DataFrame, text, file, Spark]","[text, file, HDFS, want, convert, Data, Frame,..."
4,What's the difference between struct and class...,<p>What's the difference between struct and cl...,<.net><class><struct><value-type><reference-type>,13049,847,471969,297,19,"[Whats, the, difference, between, struct, and,...","[p, Whats, the, difference, between, struct, a...","[net, class, struct, value, type, reference, t...","[Whats, difference, struct, class, NET]","[Whats, difference, struct, class, NET]"
