## <u>Libraries</u>

In [14]:
import numpy as np 
import pandas as pd

import wptools
import wikipedia
 
import string 
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

import warnings
warnings.filterwarnings("ignore")

<div class='alert alert-success'><h1> Part 1 : Corpus Extraction</h1></div>

In [15]:
def get_results(endpoint_url, query):
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

**person_info** retrieves from the wd of each person : title, description, page content(text). In case, the page does not exist, the exception PageError is raised in addition to DisambiguationError when there are similar names to the person. When it is the case, the function returns an empty dictionary as it will so in case there is less than n sentences in the page content. 

In [16]:
def person_info(wd, n=5): 
    '''
        retrieve title and description of each human from its wd 
    '''
    person = {}
    
    # get wikidata 
    page = wptools.page(wikibase=wd)
    page.get_wikidata()
    
    # add informations to dict person 
    person['wd'] = wd 
    person['title'] = page.data['title']
    person['description'] = page.data['description'] 
    
    
    # get content 
    try:
        wikipedia.set_lang("en")
        page = wikipedia.page(person['title'])
    except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
        #if a PageError and DisambiguationError are raised don't return {}
        return {}
    
    # add informations to dict person
    from nltk.tokenize import sent_tokenize
    #text = text.replace("\n","")
    sentences = sent_tokenize(page.content)   # sentence tokenization 
    
    # in case there is less than n sentences 
    if len(sentences) < n:
        return {}
    
    # otherwise 
    page_content = ' '.join(sentences[:n])
    person['text'] = page_content                       
    
    return person 

In [17]:
# Test person_info() 
#person_info('Q27072')

**cat_list**

In [18]:
def cat_list(l, cat_name, k):
    '''
        retrieve all the persons according to the cat_name from the corresponding list of wd l 
    '''

    category = []
    i = 0 
    while len(category) < k and i < len(l):
        
        person_wd = l[i]
        person = person_info(person_wd)  # dictionary of person's info 
        i += 1 
        
        # if the person does not exist 
        if person == {}: 
            continue
        
        person['cat'] = cat_name[:-1]   #remove the 's' at the end  
        category.append(person)     
            
    return category

In [19]:
# Test cat_dict
#L = ["Q23696","Q27072","Q4616","Q43323"]
#cat_dict(L,"mathematicians")

**list_wd_category**

In [20]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [21]:
def list_wd_category(wd_cat, K=100):
    '''
        K : number of wd retrieved (to be maximized)
        returns a list of wd for each person from the wd of a category 
    '''
    from SPARQLWrapper import SPARQLWrapper, JSON
    endpoint_url = "https://query.wikidata.org/sparql"

    query = """SELECT DISTINCT ?item ?itemLabel ?itemDescription  WHERE {{
      ?item wdt:P31 wd:Q5.
      ?item wdt:P106 wd:{}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],fr,en" }}

    }}
    LIMIT {}""".format(wd_cat,K)

    results = get_results(endpoint_url, query)
    
    list_wd = []
    for result in results["results"]["bindings"]:
        list_wd.append(result['item']['value'][31:])   # wd 
    
    return list_wd

In [22]:
# Test list_wd_category
#list_wd_category("Q170790") #mathematicians 

**type_data**

In [23]:
def type_data(type_dic, type_letter, k=30):
    '''
        create a dataframe of persons from their types and categories
        type_dic : dictionnary of {cat:wd}
    '''
    type_data = []
    for category, wd in type_dic.items(): 
        L = list_wd_category(wd)   # retrieves a list of wds 
        catList = cat_list(L, category, k)
        
        # add the type letter 
        for d in catList: 
            d['type'] = type_letter
        
        # retrieves a list of dict for each person with its corresponding infos
        type_data.extend(catList)   
        print(">>>>> {} infos have been retrieved !".format(category))
        
    return type_data

In [24]:
artists = {"painters":"Q1028181", "singers":"Q177220", "writers":"Q36180"}
non_artists = {"mathematicians":"Q170790", "architects":"Q42973", "politicians":"Q82955"}

In [25]:
# Test type_data
#artists_data = type_data(type_dic=non_artists, type_letter='Z', k=2)

In [26]:
artists_data = type_data(type_dic=artists, type_letter='A', k=30)
artists_df = pd.DataFrame(artists_data, columns=['title','wd','type','cat','description','text'])
artists_df.head()

www.wikidata.org (wikidata) Q7803
www.wikidata.org (labels) P691|Q1131476|P18|P8406|P6844|Q8989076|...
www.wikidata.org (labels) Q555015|Q2044|P949|P1273|Q842858|P5271|...
www.wikidata.org (labels) P3762|P735|P2342|P973|Q671384|P271|P322...
www.wikidata.org (labels) P8750|Q163804|Q6581097|Q214867|P7444|P4...
www.wikidata.org (labels) P373|Q20729893|P268|P937|Q705551|Q21008...
en.wikipedia.org (imageinfo) File:Alessandro Allori, santissima t...
Bronzino (en) data
{
  aliases: <list(54)> Agnolo di Cosimo, Angelo Bronzino, Angelo Al...
  claims: <dict(137)> P1066, P910, P21, P373, P244, P214, P227, P1...
  description: Italian Mannerist painter
  image: <list(1)> {'file': 'File:Alessandro Allori, santissima tr...
  label: Bronzino
  labels: <dict(213)> P691, Q1131476, P18, P8406, P6844, Q8989076,...
  modified: <dict(1)> wikidata
  requests: <list(7)> wikidata, labels, labels, labels, labels, la...
  title: Bronzino
  what: human
  wikibase: Q7803
  wikidata: <dict(137)> student of (P1066

>>>>> painters infos have been retrieved !


www.wikidata.org (wikidata) L488307


LookupError: wikidata item L488307 has been deleted

In [None]:
non_artists_data = type_data(type_dic=non_artists, type_letter='Z', k=30)
non_artists_df = pd.DataFrame(non_artists_data, columns=['title','wd','type','cat','description','text'])
non_artists_df.head()

**dataframe**

In [None]:
data = pd.concat([artists_df,non_artists_df], axis=0)
data.head()

In [None]:
data.to_csv('data.csv')

<div class='alert alert-success'><h1> Part 2 : Pre-processing, Clustering and Classifying</h1></div>

#### Pre-processing

In [None]:
data = pd.read_csv('data.csv', index_col=0)
data.head()

**Contractions to expantions**

In [None]:
!pip3 install contractions 

In [None]:
#https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/
import contractions
def cont2expand(text):
    '''
        convert contractions in a text into expantions 
    '''
    expanded_words = []    
    for word in text.split():
        expanded_words.append(contractions.fix(word))   
    expanded_text = ' '.join(expanded_words)
    return expanded_text

**Lemmatization**

In [None]:
from nltk.stem import WordNetLemmatizer
def lemmatization(tokens):
    '''
        lemmatize tokens and returns a list of lemmatized tokens 
    '''
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [None]:
def preprocessing(text):
    '''
        Preprocess text (string) : Remove Punctuation, Tokenization, Lowercase, Remove Stopwords, Contractions to Expantions 
    '''
    
    # remove punctuation except '-'
    punc = string.punctuation.replace('-','') 
    text = "".join([char for char in text if char not in punc])
    
    # Contractions > Expantions 
    text = cont2expand(text)
    
    # tokenization 
    tokens = nltk.word_tokenize(text)
    
    # lowercase tokens  
    tokens = [x.lower() for x in tokens]
    
    # remove stopwords 
    tokens = [x for x in tokens if not x in stopwords.words('english')]
    
    # lemmatization 
    tokens = lemmatization(tokens)
    
    return " ".join(tokens)

In [None]:
%%time 

person = data['title']
wikipedia = data['text']
wikipedia_after_pre = data['text'].apply(preprocessing)  # preprocessing 
wikidata = data['description']
wikidata_after_pre = data['description'].apply(preprocessing)  # preprocessing
wiki_combined = df['Wikipedia_cleaned'].astype(str) + ' ' + df['Wikidata_cleaned'].astype(str)

In [None]:
dic = {'Person':person,
       'Wikipedia':wikipedia,
       'Wikipedia_cleaned':wikipedia_after_pre,
       'Wikidata':wikidata,
       'Wikidata_cleaned':wikidata_after_pre,
       'Wiki_combined':wiki_combined}
df = pd.DataFrame(dic)

**Language Detection**

In [None]:
!pip3 install langdetect

In [None]:
from langdetect.lang_detect_exception import LangDetectException

def languages_detection(text):
    '''
        extract all languages in a text 
    '''
    from langdetect import detect
    tokens = nltk.word_tokenize(text)
    languages = []
    for token in tokens: 
        try: 
            languages.append(detect(token))
        except LangDetectException as e: 
            pass 
            
    return languages

In [None]:
text = "العَرَبِيَّة 中文 Ein zwei drei vier"
languages_detection(text)

In [None]:
Languages = df['Wikipedia_cleaned'].apply(languages_detection)

**Country Detection**

In [None]:
!pip3 install locationtagger 
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [None]:
import locationtagger
def country_detection(text):
    '''
        retrieves a list of countries from cities, regions and country mentions 
    '''
    countries = []
    entities = locationtagger.find_locations(text = text)
    countries.extend(entities.country_cities.keys())
    countries.extend(entities.countries)
    
    return list(set(countries))

In [None]:
text = "Unlike India and Japan, A winter weather advisory remains in effect through 5 PM along and east of a line from Blue Earth, to Red Wing line in Minnesota and continuing to along an Ellsworth, to Menomonie, and Chippewa Falls line in Wisconsin."
country_detection(text)

In [None]:
Countries = df['Wikipedia_cleaned'].apply(country_detection)

**Named Entity Recognition**

In [None]:
def named_entities(text):
    '''
        extracts entity_names from a text
    '''
    doc = nlp(str(text))
    entities_names = []
    
    for ent in doc.ents:
        entities_names.append(ent.label_)   # extract entity names 

    return entities_names

In [None]:
def entities(text):
    '''
        extracts entities from a text
    '''
    doc = nlp(str(text))
    entities = []
    
    for ent in doc.ents:
        entities.append(ent.text)  # extract entities 

    return entities

In [None]:
Entity_Names = data['text'].apply(named_entities)
Entities = data['text'].apply(entities)

**Dependency Parsing**

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
def dependency_graph(text):
    dependencies = []

    doc = nlp(text)
    all_two_treelet = ""
    for token in doc:
        two_treelet = str(token.pos_) + " -> " + str(token.dep_) + " -> " + str(token.head.pos_)
        all_two_treelet += two_treelet + " | "
    dependencies.append(all_two_treelet)
    return pd.Series(dependencies)

In [None]:
%%time 
Dependency_Graph = data['description'].apply(dependency_graph)
Dependency_Graph
# vectorization with tfidf 

**POS Tags**

In [None]:
def pos_tags(text):
    '''
        create a dictionary of tags and their occurrences 
    '''
    from collections import Counter
    pos_tags = []
    
    doc = nlp(text)
    for token in doc:
        pos_tags.append(str(token.pos_))
    
    return pos_tags

In [None]:
POS_TAGS = data['text'].apply(pos_tags)

**3-Grams**

In [None]:
# code from : https://stackoverflow.com/questions/17531684/n-grams-in-python-four-five-six-grams
def get_ngrams(text, n=3):
    n_grams = ngrams(word_tokenize(text), n)
    return [' '.join(grams) for grams in n_grams]

In [None]:
get_ngrams('William Blake (28 November 1757 – 12 August 18', 3)

In [None]:
Tri_Grams = data['text'].apply(lambda x:get_ngrams(x, 3))
Tri_Grams

**Gather everything**

In [None]:
dic = {'Person':person,
       'Wikipedia':wikipedia,
       'Wikipedia_cleaned':wikipedia_after_pre,
       'Wikidata':wikidata,
       'Wikidata_cleaned':wikidata_after_pre,
       'Wiki_combined':wiki_combined,
       'Entity_Names': Entity_Names,
       'Entities': Entities,
       'Countries': Countries,
       'Languages': Languages,
       'POS_Tags' : POS_TAGS,
       'Tri_Grams': Tri_Grams}

In [None]:
df = pd.DataFrame(dic)
df.head()

In [None]:
df.to_csv('information_extracted.csv') #without dependency graph 

<div class='alert alert-success'><h1> Part 3 : Clustering </h1></div>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

## Vectorize Textual Data (String)

In [None]:
def vectorizer(corpus, method):
    vectorizer = []
    vectors = []
    if method.lower() == 'tokens':
        vectorizer = CountVectorizer(binary=True)
        vectors = vectorizer.fit_transform(corpus)
        
    if method.lower() =='tokens frequency' or method =='bow':
        vectorizer = CountVectorizer()
        vectors = vectorizer.fit_transform(corpus)
    
    if method == 'tfidf':
        vectorizer  = TfidfVectorizer(max_features=8000,
                                       use_idf=True,
                                       stop_words='english',
                                       tokenizer=nltk.word_tokenize,
                                       ngram_range=(1, 3))
        vectors = vectorizer.fit_transform(corpus)
    return pd.DataFrame(data= vectors.toarray(),columns=vectorizer.vocabulary_.keys())

**Vectorization of Wikipedia_cleaned and Wikidata_cleaned with BOW method**

In [None]:
# Method : Tokens Frequency (BOW) 
wikipedia_bow = vectorizer(df['Wikipedia_cleaned'], method='bow'); print("wikipedia_bow shape : ", wikipedia_bow.shape)
wikidata_bow = vectorizer(df['Wikidata_cleaned'], method='bow'); print("wikidata_bow shape : ", wikidata_bow.shape)

features_bow = pd.concat([wikipedia_bow,wikidata_bow], axis=1)
features_bow.head()

**Vectorizations of the combinaison : Wikipedia_cleaned and Wikidata_cleaned**

In [None]:
wiki_t = vectorizer(df['Wiki_combined'], method='tokens'); print("wiki_t shape : ", wiki_t.shape)
wiki_bow = vectorizer(df['Wiki_combined'], method='bow'); print("wiki_bow shape : ", wiki_bow.shape)
wiki_tfidf = vectorizer(df['Wiki_combined'], method='tfidf'); print("wiki_tfidf shape : ", wiki_tfidf.shape)

## Vectorize lists of Text

In [None]:
def list_to_bow(s):
    '''
        s : pd.series 
        Takes a pd.Series of lists (of keywords) and return a bag of words of it
    '''
    from collections import Counter
    
    # gather all keywords in one single list 
    keywords = []
    for row in s: 
        keywords.extend(row)
    keywords = list(set(keywords))  #unique values 
    
    # frequence of each keywords 
    features = pd.DataFrame(np.nan, index=range(0,len(s)), columns=keywords) # empty dataframe (persons x keywords)
    for i, row in enumerate(s): 
        row_count = dict(Counter(row))
        for keyword in keywords: 
            if keyword not in row_count.keys():   # row_count.keys() = list of keywords per person 
                features[keyword][i] = 0
            else: 
                features[keyword][i] = row_count[keyword]
    
    return features.astype(int)

**NER Vectorization**

In [None]:
ner_vect = list_to_bow(df['Entity_Names'])
ner_vect.head()

In [None]:
pd.DataFrame(ner_vect.sum().sort_values(ascending=False)).plot.bar()

**POS-Tags Vectorization**

In [None]:
pos_vect = list_to_bow(df['POS_Tags'])
pos_vect.head()

In [None]:
import seaborn as sns 
pd.DataFrame(pos_vect.sum().sort_values(ascending=False)).plot.bar(color='yellow')

**Countries Vectorization**

In [None]:
countries_vect = list_to_bow(df['Countries'])
countries_vect.head()

In [None]:
pd.DataFrame(countries_vect.sum().sort_values(ascending=False))[:30].plot.bar(color='cyan')

**Languages Vectorization**

In [None]:
lang_vect = list_to_bow(df['Languages'])
lang_vect.head()

In [None]:
pd.DataFrame(lang_vect.sum().sort_values(ascending=False))[:30].plot.bar(color='m')

## Clustering 

In [None]:
def clustering(X, nbr_clusters):
    #Check the input representation method needed
    #vectors = vectorizer(data,method)
    
    # Create a KMeans clustering model
    km = KMeans(n_clusters=nbr_clusters, 
                init='k-means++', 
                max_iter=300, 
                n_init=5, 
                verbose=0, 
                random_state=3425)
    km.fit(X) # Apply the clustering model
    y_pred = km.labels_  # Find out the predicted labels

    return y_pred

In [None]:
def intrinsic_metrics(vectors, predicted_values):
    # When no ground truth is available
    return {'Silouhette Coefficient':metrics.silhouette_score(vectors, predicted_values, sample_size=1000)}

In [None]:
def extrinsic_metrics(predicted_values, true_values):
    #When we have ground truth
    homogeneity = metrics.homogeneity_score(true_values, predicted_values)
    completeness = metrics.completeness_score(true_values, predicted_values)
    v_mesure = metrics.v_measure_score(true_values, predicted_values)
    adjust_rand_index = metrics.adjusted_rand_score(true_values, predicted_values)

    return {'Homogeneity':homogeneity,'Completeness':completeness,'V-Mesure':v_mesure,'Adjuste Rand Index':adjust_rand_index}

In [None]:
def visualization(intrinsic_measure, extrinsic_measure,method):
    #Extraction of labels and values
    labels = list(extrinsic_measure.keys())+list(intrinsic_measure.keys())
    values = list(extrinsic_measure.values())+list(intrinsic_measure.values())

    sns.barplot( x=labels, y=values)
    plt.xlabel('metrics values')
    plt.title('{} representation'.format(method))
    plt.show()

## Methods

**Target Encoding**

In [None]:
y = data['type']
y.value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = data['type']
y_test = pd.Series(encoder.fit_transform(y))
y_test.value_counts()

**Method 1 : Wikipedia (BOW)**

In [None]:
# Features 
features_1 = wikipedia_bow

In [None]:
# Training & Predictions 
ypred_1 = clustering(X=features_1, nbr_clusters=2)

In [None]:
# Evaluation
method_1 = extrinsic_metrics(ypred_1, y_test)
method_1.update(intrinsic_metrics(features_1, ypred_1))
method_1['Method'] = 'Wikipedia (BOW)'
method_1

**Method 2 : Wikidata (BOW)**

In [None]:
# Features
features_2 = wikidata_bow

In [None]:
# Training 
ypred_2 = clustering(X=features_2, nbr_clusters=2)

In [None]:
# Evaluation
method_2 = extrinsic_metrics(ypred_2, y_test)
method_2.update(intrinsic_metrics(features_2, ypred_2))
method_2['Method'] = 'Wikidata (BOW)'
method_2

**Method 3 : Wikipedia + Wikidata (Tokens)**

In [None]:
# Features 
features_3 = wiki_t

In [None]:
# Training 
ypred_3 = clustering(X=features_3, nbr_clusters=2)

In [None]:
# Evaluation
method_3 = extrinsic_metrics(ypred_3, y_test)
method_3.update(intrinsic_metrics(features_3, ypred_3))
method_3['Method'] = 'Wikipedia + Wikidata (Tokens)'
method_3

**Method 4 : Wikipedia + Wikidata (BOW)**

In [None]:
# Features 
features_4 = wiki_bow

In [None]:
# Training 
ypred_4 = clustering(X=features_4, nbr_clusters=2)

In [None]:
# Evaluation
method_4 = extrinsic_metrics(ypred_4, y_test)
method_4.update(intrinsic_metrics(features_4, ypred_4))
method_4['Method'] = 'Wikipedia + Wikidata (BOW)'
method_4

**Method 5 : Wikipedia + Wikidata (TFIDF)**

In [None]:
# Features 
features_5 = wiki_tfidf

In [None]:
# Training 
ypred_5 = clustering(X=features_5, nbr_clusters=2)

In [None]:
# Evaluation
method_5 = extrinsic_metrics(ypred_5, y_test)
method_5.update(intrinsic_metrics(features_5, ypred_5))
method_5['Method'] = 'Wikipedia + Wikidata (TF-IDF)'
method_5

**Method 6 : Wikipedia + Wikidata + NER (TFIDF)**

In [None]:
# Features 
features_6 = pd.concat([wiki_tfidf, ner_vect], axis=1)

In [None]:
# Training 
ypred_6 = clustering(X=features_6, nbr_clusters=2)

In [None]:
# Evaluation
method_6 = extrinsic_metrics(ypred_6, y_test)
method_6.update(intrinsic_metrics(features_6, ypred_6))
method_6['Method'] = 'Wikipedia + Wikidata + NER '
method_6

**Method 7 : NER + POS-Tags**

In [None]:
# Features
features_7 = pd.concat([ner_vect, pos_vect], axis=1)

In [None]:
# Training 
ypred_7 = clustering(X=features_7, nbr_clusters=2)

In [None]:
# Evaluation
method_7 = extrinsic_metrics(ypred_7, y_test)
method_7.update(intrinsic_metrics(features_7, ypred_7))
method_7['Method'] = 'NER + POS-TAGS'
method_7

**Method 8 : Wikipedia + Wikidata + NER + POS-Tags + Languages + Countries**

In [None]:
# Features
features_8 = pd.concat([wiki_tfidf, ner_vect, pos_vect, lang_vect, countries_vect], axis=1)

In [None]:
ypred_8 = clustering(X=features_8, nbr_clusters=2)

In [None]:
# Evaluation
method_8 = extrinsic_metrics(ypred_8, y_test)
method_8.update(intrinsic_metrics(features_8, ypred_8))
method_8['Method'] = 'Wikipedia + Wikidata + NER + POS-TAGS + Languages + Countries'
method_8

**Method 9 : NER**

In [None]:
# Features
features_9 = ner_vect

In [None]:
# Training 
ypred_9 = clustering(X=features_9, nbr_clusters=2)

In [None]:
# Evaluation
method_9 = extrinsic_metrics(ypred_9, y_test)
method_9.update(intrinsic_metrics(features_9, ypred_9))
method_9['Method'] = 'NER'
method_9

**Summary**

In [None]:
#For more readability, we create a dataframe
summary = pd.DataFrame([method_1, method_2, method_3, method_4, method_5, method_6, method_7, method_8, method_9], 
                       columns=['Method','Silouhette Coefficient','Homogeneity','Completeness','V-Mesure','Adjuste Rand Index'])
summary.sort_values(by=['Silouhette Coefficient'], ascending=False)

## Clustering with 6 clusters 

In [None]:
from sklearn.preprocessing import LabelEncoder
#Here we define our test set
encoder = LabelEncoder()
y = data['cat']
y_test = pd.Series(encoder.fit_transform(y))
y_test.value_counts()

In [None]:
# Training & Predictions 
ypred_1 = clustering(X=features_1, nbr_clusters=6)
# Evaluation
method_1 = extrinsic_metrics(ypred_1, y_test)
method_1.update(intrinsic_metrics(features_1, ypred_1))
method_1['Method'] = 'Wikipedia (BOW)'
#method_1

# Training 
ypred_2 = clustering(X=features_2, nbr_clusters=6)
# Evaluation
method_2 = extrinsic_metrics(ypred_2, y_test)
method_2.update(intrinsic_metrics(features_2, ypred_2))
method_2['Method'] = 'Wikidata (BOW)'
#method_2

# Training 
ypred_3 = clustering(X=features_3, nbr_clusters=6)
# Evaluation
method_3 = extrinsic_metrics(ypred_3, y_test)
method_3.update(intrinsic_metrics(features_3, ypred_3))
method_3['Method'] = 'Wikipedia + Wikidata (Tokens)'
#method_3

# Training 
ypred_4 = clustering(X=features_4, nbr_clusters=6)
# Evaluation
method_4 = extrinsic_metrics(ypred_4, y_test)
method_4.update(intrinsic_metrics(features_4, ypred_4))
method_4['Method'] = 'Wikipedia + Wikidata (BOW)'
#method_4

# Training 
ypred_5 = clustering(X=features_5, nbr_clusters=6)
# Evaluation
method_5 = extrinsic_metrics(ypred_5, y_test)
method_5.update(intrinsic_metrics(features_5, ypred_5))
method_5['Method'] = 'Wikipedia + Wikidata (TF-IDF)'
#method_5

# Training 
ypred_6 = clustering(X=features_6, nbr_clusters=6)
# Evaluation
method_6 = extrinsic_metrics(ypred_6, y_test)
method_6.update(intrinsic_metrics(features_6, ypred_6))
method_6['Method'] = 'Wikipedia + Wikidata + NER '
#method_6

# Training 
ypred_7 = clustering(X=features_7, nbr_clusters=6)
# Evaluation
method_7 = extrinsic_metrics(ypred_7, y_test)
method_7.update(intrinsic_metrics(features_7, ypred_7))
method_7['Method'] = 'Wikipedia + Wikidata + NER + POS-TAGS'
#method_7

ypred_8 = clustering(X=features_8, nbr_clusters=6)
# Evaluation
method_8 = extrinsic_metrics(ypred_8, y_test)
method_8.update(intrinsic_metrics(features_8, ypred_8))
method_8['Method'] = 'Wikipedia + Wikidata + NER + POS-TAGS + Languages + Countries'
#method_8

# Features
features_9 = ner_vect
# Training 
ypred_9 = clustering(X=features_9, nbr_clusters=6)
# Evaluation
method_9 = extrinsic_metrics(ypred_9, y_test)
method_9.update(intrinsic_metrics(features_9, ypred_9))
method_9['Method'] = 'NER'
#method_9

**Summary**

In [None]:
summary = pd.DataFrame([method_1, method_2, method_3, method_4, method_5, method_6, method_7, method_8, method_9], 
                       columns=['Method','Silouhette Coefficient','Homogeneity','Completeness','V-Mesure','Adjuste Rand Index'])
summary.sort_values(by=['Silouhette Coefficient'], ascending=False)

<div class='alert alert-success'><h1> Part 4 : Classification</h1></div>

## Models

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
#Definition of perceptron function. It takes X_train, y_train, X_test,y_test and return  accuracy, confision matrix and 
#a report that contains recall,precision,f1_score
def perceptron(X_train, y_train, X_test,y_test):
    per_clf = Perceptron(max_iter=5, tol=None)
    # Train the model on the training data
    per_clf.fit(X_train, y_train)

    # Test the model on the test data
    y_pred = per_clf.predict(X_test)
    #Accuracy
    acc = accuracy_score(y_test, y_pred)
    #Confusion Matrix
    mat = confusion_matrix(y_test, y_pred)
    #Classification report
    report = classification_report(y_test, y_pred)
    return acc,report,mat

In [None]:
#Definition of random forest function. It takes X_train, y_train, X_test,y_test and return  accuracy, confision matrix and 
#a report that contains recall,precision,f1_score
def random_forest(X_train, y_train, X_test,y_test):

    rf_clf = RandomForestClassifier(n_estimators=100)
    # Train the model on the training data
    rf_clf.fit(X_train, y_train)
    # Test the model on the test data
    y_pred = rf_clf.predict(X_test)
    #Accuracy
    acc = accuracy_score(y_test, y_pred)
    #Confusion Matrix
    mat = confusion_matrix(y_test, y_pred)
    #Classification report
    report = classification_report(y_test, y_pred)
    return acc,report,mat

In [None]:
#SVM model
def svm(X_train, y_train, X_test, y_test):

    svm = SVC()
    # Train the model on the training data
    svm.fit(X_train,y_train)
    # Test the model on the test data
    y_pred = svm.predict(X_test)
    #Accuracy
    acc = accuracy_score(y_test, y_pred)
    #Confusion Matrix
    mat = confusion_matrix(y_test, y_pred)
    #Classification report
    report = classification_report(y_test, y_pred)
    return acc,report,mat

## Categories Classification (Binary Classification)

In [None]:
# use of previous features
from sklearn.model_selection import train_test_split

# Encoding the target using label encoder from sklearn
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = data['type']
y = pd.Series(encoder.fit_transform(y))

# Split the data : Train set & Test Set 
f = [features_1, features_2, features_3, features_4, features_5, features_6, features_7, features_8, features_9]
features = []
for feature in f: 
    X_train, X_test, y_train, y_test = train_test_split(feature, y, test_size=0.33, random_state=42)
    features.append([X_train, X_test, y_train, y_test])

In [None]:
#Here the goal is to make a loop to test all data representation method. We also classification function intanciate above
methods = {}
reports = {}
confusion_matrices = {}
for i, feature in enumerate(features): 
    #Defining train and test data
    X_train, X_test, y_train, y_test = feature[0], feature[1], feature[2], feature[3]
    #Retrieve results return by perceptron classifier
    accuracy_perceptron, report_perceptron, confusion_mat_perceptron = perceptron(X_train, y_train, X_test, y_test)
    #Retrieve results return by Random Forest classifier
    accuracy_rf, report_rf, confusion_mat_rf = random_forest(X_train, y_train, X_test, y_test)
    #Retrieve results return by SVM
    accuracy_svm, report_svm, confusion_mat_svm = svm(X_train, y_train, X_test, y_test)
    #Here we get all results returned by our classifiers
    methods['method_' + str(i+1)] = [accuracy_perceptron, accuracy_rf, accuracy_svm]
    reports['method_' + str(i+1)] = [report_perceptron, report_rf, report_svm]
    confusion_matrices['method_' + str(i)] = [confusion_mat_perceptron, confusion_mat_rf, confusion_mat_svm]

In [None]:
#To have a better presentation of results we build a dataframe 
summary_bclass = pd.DataFrame.from_dict(methods, orient='index', columns=['Accuracy Perceptron','Accuracy Random Forest','Accuracy SVM'])
summary_bclass

## Subcategories Classification (Multi-Class Classification)

In [None]:
# use of previous features
from sklearn.model_selection import train_test_split

# Encoding the target 
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = data['cat']
y = pd.Series(encoder.fit_transform(y))

# Split the data : Train set & Test Set 
f = [features_1, features_2, features_3, features_4, features_5, features_6, features_7, features_8, features_9]
features = []
for feature in f: 
    X_train, X_test, y_train, y_test = train_test_split(feature, y, test_size=0.33, random_state=42)
    features.append([X_train, X_test, y_train, y_test])

In [None]:
#Here the goal is to make a loop to test all data representation method. We also classification function intanciate above
#Same as previous cells
methods = {}
reports = {}
confusion_matrices = {}
for i, feature in enumerate(features): 
    X_train, X_test, y_train, y_test = feature[0], feature[1], feature[2], feature[3]
    accuracy_perceptron, report_perceptron, confusion_mat_perceptron = perceptron(X_train, y_train, X_test, y_test)
    accuracy_rf, report_rf, confusion_mat_rf = random_forest(X_train, y_train, X_test, y_test)
    accuracy_svm, report_svm, confusion_mat_svm = svm(X_train, y_train, X_test, y_test)
    methods['method_' + str(i+1)] = [accuracy_perceptron, accuracy_rf, accuracy_svm]
    reports['method_' + str(i+1)] = [report_perceptron, report_rf, report_svm]
    confusion_matrices['method_' + str(i)] = [confusion_mat_perceptron, confusion_mat_rf, confusion_mat_svm]

In [None]:
#To have a better presentation of results we build a dataframe 
summary_bclass = pd.DataFrame.from_dict(methods, orient='index', columns=['Accuracy Perceptron','Accuracy Random Forest','Accuracy SVM'])
summary_bclass