In [1]:
import numpy as np 
import pandas as pd 
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

# Read file and change the representation of keywords from dict frenquency to a matrix can be analysed by CountVectorizer

In [2]:
mydata = pd.read_csv('./train.csv')
mydata.head()

Unnamed: 0,ID,keywords,age,sex
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62,F
1,2,restaurant:1;marrakech.shtml:1,35,M
2,3,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45,F
3,4,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46,F
4,5,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42,F


In [3]:
df = mydata.copy()
df_nona = df.dropna().reset_index(drop = True)
df_nona.shape

(6418659, 4)

In [4]:
df_nona['nbr_words'] = df_nona['keywords'].apply(lambda x : x.count(";"))

In [5]:
droprows = df_nona[df_nona['nbr_words'] < 50].index
df_nona = df_nona.drop(droprows).reset_index(drop = True)

In [6]:
def listToDict(lst):
    '''
    convert a list to dictionary
    e.g. 'restaurant:1;marrakech.shtml:1' --> {'restaurant:1, marrakech.shtml:1}
    
    '''
    op = {lst[i]: lst[i + 1] for i in range(0, len(lst), 2)}

    return op

def real_sent(input_dict):
    '''
    combine the word with frequency to one element 
    '''
    sent = ''
    for word in input_dict:
        sent = sent + (word +' ') * input_dict[word]
    return sent

def token(data):
    '''
    from word with frequency to a complete piece 
    e.g. 'restaurant:1;marrakech.shtml:1' --> restaurant marrakech.shtml
    '''

    target = []
    for element in data:
        # split the word frequency 
        element = element.replace(';', ':')
        element = element.split(':')
        
        # skip the Indexerror 
        try:
            the_dict = listToDict(element)
        except IndexError:
            pass
        
        # final data
        for key in the_dict:
            the_dict[key] = int(the_dict[key])
        target.append(real_sent(the_dict))   
    return target

In [7]:
df = df_nona.iloc[:10000,:]
tokens = token(df.keywords)
df['text'] = pd.DataFrame(tokens)
df['text'] = df['text'].apply(lambda x : x.replace('é','e'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = pd.DataFrame(tokens)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x : x.replace('é','e'))


In [8]:
df

Unnamed: 0,ID,keywords,age,sex,nbr_words,text
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62,F,57,fibre fibre fibre fibre fibre fibre fibre fibr...
1,24,tabac:1;les:1;terrorisme:1;excuses:2;luxe:2;do...,40,M,68,tabac les terrorisme excuses excuses luxe luxe...
2,28,lively:1;messi:1;enzo:1;nolwenn:1;diaz:1;kourt...,50,F,111,lively messi enzo nolwenn diaz kourtney beckha...
3,33,adams:1;refaeli:1;paltrow:1;brodier:2;ariane:2...,52,F,230,adams refaeli paltrow brodier brodier ariane a...
4,42,service:1;siege:1;avenue:1;douleur+a+la+poitri...,62,F,155,service siege avenue douleur+a+la+poitrine+cot...
...,...,...,...,...,...,...
9995,170024,delon:1;sardou:2;loup:1;sur:1;bernard:1;martin...,34,F,67,delon sardou sardou loup sur bernard martin pa...
9996,170058,pierre:2;chazal:1;chazel:1;photos:33;loup:1;br...,43,F,69,pierre pierre chazal chazel photos photos phot...
9997,170073,sortir:22;terrain:1;enflamme:1;avec:1;pratique...,56,M,64,sortir sortir sortir sortir sortir sortir sort...
9998,170079,lavoine:1;arcady:1;saluer:1;hossein:1;chazel:1...,28,M,72,lavoine arcady saluer hossein chazel robert so...


# clean the text to proper text by lemmatizer and tokenize and remove the meaningless words

In [9]:
#removes stopwords, numbers, meaningless words
nlp = spacy.load('fr_core_news_md')

In [10]:
def clean_my_text(data, col1):
    def cleanline(k):
        doc = nlp(k)
        mystr = ''
        for token in doc:
            if token.is_alpha and token.has_vector and (not token.is_stop):
                mystr += token.lemma_.replace('é','e') + ' '
        return mystr
    kk = data[col1].apply(lambda x:cleanline(x))
    mydff = pd.DataFrame(kk)
    return mydff

In [11]:
df_test = df
df_test['token_clean'] = clean_my_text(df_test, 'text')
df_test.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['token_clean'] = clean_my_text(df_test, 'text')


(10000, 7)

In [12]:
df_test

Unnamed: 0,ID,keywords,age,sex,nbr_words,text,token_clean
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62,F,57,fibre fibre fibre fibre fibre fibre fibre fibr...,fibre fibre fibr fibr fibr fibr fibr fibr fibr...
1,24,tabac:1;les:1;terrorisme:1;excuses:2;luxe:2;do...,40,M,68,tabac les terrorisme excuses excuses luxe luxe...,tabac terrorisme excuse excuse lux lux dollar ...
2,28,lively:1;messi:1;enzo:1;nolwenn:1;diaz:1;kourt...,50,F,111,lively messi enzo nolwenn diaz kourtney beckha...,lively messi end end end end end end end end e...
3,33,adams:1;refaeli:1;paltrow:1;brodier:2;ariane:2...,52,F,230,adams refaeli paltrow brodier brodier ariane a...,ariane ariane victoria tobe chanson fete fet f...
4,42,service:1;siege:1;avenue:1;douleur+a+la+poitri...,62,F,155,service siege avenue douleur+a+la+poitrine+cot...,service siege avenue gerard transfert historiq...
...,...,...,...,...,...,...,...
9995,170024,delon:1;sardou:2;loup:1;sur:1;bernard:1;martin...,34,F,67,delon sardou sardou loup sur bernard martin pa...,loup bernard martin pascal pascal lang journal...
9996,170058,pierre:2;chazal:1;chazel:1;photos:33;loup:1;br...,43,F,69,pierre pierre chazal chazel photos photos phot...,pierre pierre photo photo photo photo photo ph...
9997,170073,sortir:22;terrain:1;enflamme:1;avec:1;pratique...,56,M,64,sortir sortir sortir sortir sortir sortir sort...,sortir sortir sortir sortir sortir sortir sort...
9998,170079,lavoine:1;arcady:1;saluer:1;hossein:1;chazel:1...,28,M,72,lavoine arcady saluer hossein chazel robert so...,saluer robert bernard pascal pascal mari mari ...


# TfidfVectorizer and train models

In [13]:
#split training and test set
Y = df_test[["sex","age"]]
X_train_pre, X_test_pre, y_train, y_test = train_test_split(df_test.token_clean, Y, test_size=0.2, random_state=66)
tfidf = TfidfVectorizer(min_df=0.01)
tfidffit = tfidf.fit(X_train_pre)
X_train = tfidffit.transform(X_train_pre)
#Dump file
pickle.dump(tfidffit, open("tfidf1.pkl", "wb"))
mycv = pickle.load(open("tfidf1.pkl", 'rb'))
cv_test = TfidfVectorizer(vocabulary = mycv.vocabulary_)
cv_test_fit = cv_test.fit(X_test_pre)
X_test = cv_test.transform(X_test_pre)

In [14]:
df_test

Unnamed: 0,ID,keywords,age,sex,nbr_words,text,token_clean
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62,F,57,fibre fibre fibre fibre fibre fibre fibre fibr...,fibre fibre fibr fibr fibr fibr fibr fibr fibr...
1,24,tabac:1;les:1;terrorisme:1;excuses:2;luxe:2;do...,40,M,68,tabac les terrorisme excuses excuses luxe luxe...,tabac terrorisme excuse excuse lux lux dollar ...
2,28,lively:1;messi:1;enzo:1;nolwenn:1;diaz:1;kourt...,50,F,111,lively messi enzo nolwenn diaz kourtney beckha...,lively messi end end end end end end end end e...
3,33,adams:1;refaeli:1;paltrow:1;brodier:2;ariane:2...,52,F,230,adams refaeli paltrow brodier brodier ariane a...,ariane ariane victoria tobe chanson fete fet f...
4,42,service:1;siege:1;avenue:1;douleur+a+la+poitri...,62,F,155,service siege avenue douleur+a+la+poitrine+cot...,service siege avenue gerard transfert historiq...
...,...,...,...,...,...,...,...
9995,170024,delon:1;sardou:2;loup:1;sur:1;bernard:1;martin...,34,F,67,delon sardou sardou loup sur bernard martin pa...,loup bernard martin pascal pascal lang journal...
9996,170058,pierre:2;chazal:1;chazel:1;photos:33;loup:1;br...,43,F,69,pierre pierre chazal chazel photos photos phot...,pierre pierre photo photo photo photo photo ph...
9997,170073,sortir:22;terrain:1;enflamme:1;avec:1;pratique...,56,M,64,sortir sortir sortir sortir sortir sortir sort...,sortir sortir sortir sortir sortir sortir sort...
9998,170079,lavoine:1;arcady:1;saluer:1;hossein:1;chazel:1...,28,M,72,lavoine arcady saluer hossein chazel robert so...,saluer robert bernard pascal pascal mari mari ...


In [15]:
tfidf.get_feature_names()

['abandonnee',
 'abonnement',
 'abonner',
 'abord',
 'absence',
 'accident',
 'achat',
 'acteur',
 'action',
 'actu',
 'actualit',
 'actualite',
 'actualiter',
 'actualites',
 'adepte',
 'adsl',
 'advocaat',
 'affaire',
 'affirment',
 'afp',
 'age',
 'agent',
 'agresse',
 'agression',
 'aid',
 'aide',
 'air',
 'aire',
 'alain',
 'alcool',
 'alerte',
 'alexandra',
 'alexandre',
 'alexandrer',
 'aliment',
 'alliance',
 'alsace',
 'ambitieux',
 'amelie',
 'americain',
 'ami',
 'amincir',
 'amour',
 'amoureux',
 'an',
 'anatomy',
 'ancien',
 'and',
 'anderson',
 'ang',
 'angelique',
 'animal',
 'ann',
 'anne',
 'annee',
 'annonce',
 'annoncer',
 'annulation',
 'anti',
 'antiraciste',
 'antoin',
 'antoine',
 'aout',
 'apl',
 'appartement',
 'apple',
 'approuve',
 'arabie',
 'argent',
 'argentin',
 'ariane',
 'armee',
 'arnaque',
 'arras',
 'art',
 'article',
 'artisans',
 'artu',
 'artus',
 'assassinat',
 'association',
 'assuranc',
 'assurance',
 'astuce',
 'athletisme',
 'atlantique',
 'a

In [16]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train["sex"].tolist())
y_pred_sex = clf.predict(X_test)

In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test["sex"],y_pred_sex))

              precision    recall  f1-score   support

           F       0.78      0.76      0.77      1132
           M       0.69      0.72      0.71       868

    accuracy                           0.74      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.74      0.74      0.74      2000



In [20]:
df_test.to_csv('mycleandatanew.csv')

# Age prediction

In [21]:
from sklearn.ensemble import GradientBoostingRegressor
gbdt = GradientBoostingRegressor()
gbdt.fit(X_train, y_train["age"].tolist())
y_pred_age = gbdt.predict(X_test)

In [22]:
from sklearn.metrics import mean_squared_error,r2_score
mse = mean_squared_error(y_test["age"], y_pred_age)
mse

151.34482142073617

In [24]:
r2_score(y_test["age"], y_pred_age)

0.04756171092028538

# predict and fill up the test.csv file

In [25]:
df_target = pd.read_csv('test.csv')

In [26]:
df_target

Unnamed: 0,ID,keywords,age,sex
0,1,,,
1,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,,
2,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,,
3,4,002lundu83vnndv:1,,
4,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,,
...,...,...,...,...
3111780,3111781,frere:1;bien:1;definition:1;7e72:1;sur:3;11e7:...,,
3111781,3111782,,,
3111782,3111783,,,
3111783,3111784,,,


In [27]:
narows = df_target[df_target['keywords'].isna()].index

In [28]:
df_target = df_target.drop(narows).reset_index(drop = True)

In [29]:
test_tokens = token(df_target.keywords)
df_target['text'] = pd.DataFrame(test_tokens)
df_target

Unnamed: 0,ID,keywords,age,sex,text
0,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,,,cecilia.gosselin flash ville obseques economie...
1,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,,,p1_1697235 peut jcms les acceptees pas benefic...
2,4,002lundu83vnndv:1,,,002lundu83vnndv
3,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,,,high high high patisserie apple apple apple te...
4,6,disparition:1;vue:1;maelys:1;deuxieme:1;place:...,,,disparition vue maelys deuxieme place actu fla...
...,...,...,...,...,...
2748738,3111777,suspendu:1;medicament:1;nouveau:1;sur:1;actual...,,,suspendu medicament nouveau sur actualite marche
2748739,3111778,chocolat:1;recettes:1;eclats:1;crepes:1;sucree...,,,chocolat recettes eclats crepes sucrees pistac...
2748740,3111779,forum:1;changer:1;vrai:1;twitter:1;affich:1;so...,,,forum changer vrai twitter affich son nom
2748741,3111780,astuce:1;recettes:1;menu:1;gastronomie:1;repas...,,,astuce recettes menu gastronomie repas tomates...


In [31]:
df_target = df_target[:10000]
df_target['token_clean'] = clean_my_text(df_target, 'text')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target['token_clean'] = clean_my_text(df_target, 'text')


In [32]:
df_target

Unnamed: 0,ID,keywords,age,sex,text,token_clean
0,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,,,cecilia.gosselin flash ville obseques economie...,flash ville obseque economie cinema chagrin un...
1,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,,,p1_1697235 peut jcms les acceptees pas benefic...,assurance saisir vie fisc
2,4,002lundu83vnndv:1,,,002lundu83vnndv,
3,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,,,high high high patisserie apple apple apple te...,high high high patisserie apple apple apple te...
4,6,disparition:1;vue:1;maelys:1;deuxieme:1;place:...,,,disparition vue maelys deuxieme place actu fla...,disparition vue deuxieme place actu flash gard...
...,...,...,...,...,...,...
9995,11353,faq:2;forum:1;force:1;acompte:2;majeure:1;affi...,,,faq faq forum force acompte acompte majeure af...,faq faq forum force acompte acompte majeur loc...
9996,11354,mouhcine:1;garcon:1;prenom:1;prenoms:1,,,mouhcine garcon prenom prenoms,garcon prenom prenom
9997,11355,desprez:1;alain:1;obseques:1;pascal:1;cinema:1...,,,desprez alain obseques pascal cinema dans unis...,alain obseque pascal cinema unis chagrin
9998,11356,utilisateurs:1;mineur:10;gold:10;jeu:10;strike...,,,utilisateurs mineur mineur mineur mineur mineu...,utilisateur mineur mineur mineur mineur mineur...


In [33]:
cv_test = TfidfVectorizer(vocabulary = mycv.vocabulary_)
cv_test_fit = cv_test.fit(df_target['token_clean'])
X_to_pred = cv_test.transform(df_target['token_clean'])

In [34]:
df_target["sex"] = clf.predict(X_to_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target["sex"] = clf.predict(X_to_pred)


In [36]:
df_target["age"] = gbdt.predict(X_to_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target["age"] = gbdt.predict(X_to_pred)


In [37]:
df_target

Unnamed: 0,ID,keywords,age,sex,text,token_clean
0,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,46.988544,M,cecilia.gosselin flash ville obseques economie...,flash ville obseque economie cinema chagrin un...
1,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,48.907246,M,p1_1697235 peut jcms les acceptees pas benefic...,assurance saisir vie fisc
2,4,002lundu83vnndv:1,48.907246,F,002lundu83vnndv,
3,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,49.167906,M,high high high patisserie apple apple apple te...,high high high patisserie apple apple apple te...
4,6,disparition:1;vue:1;maelys:1;deuxieme:1;place:...,48.417619,F,disparition vue maelys deuxieme place actu fla...,disparition vue deuxieme place actu flash gard...
...,...,...,...,...,...,...
9995,11353,faq:2;forum:1;force:1;acompte:2;majeure:1;affi...,47.147050,F,faq faq forum force acompte acompte majeure af...,faq faq forum force acompte acompte majeur loc...
9996,11354,mouhcine:1;garcon:1;prenom:1;prenoms:1,48.907246,F,mouhcine garcon prenom prenoms,garcon prenom prenom
9997,11355,desprez:1;alain:1;obseques:1;pascal:1;cinema:1...,49.197891,F,desprez alain obseques pascal cinema dans unis...,alain obseque pascal cinema unis chagrin
9998,11356,utilisateurs:1;mineur:10;gold:10;jeu:10;strike...,48.907246,F,utilisateurs mineur mineur mineur mineur mineu...,utilisateur mineur mineur mineur mineur mineur...


In [38]:
df_target = df_target[["ID","age","sex"]]
df_target

Unnamed: 0,ID,age,sex
0,2,46.988544,M
1,3,48.907246,M
2,4,48.907246,F
3,5,49.167906,M
4,6,48.417619,F
...,...,...,...
9995,11353,47.147050,F
9996,11354,48.907246,F
9997,11355,49.197891,F
9998,11356,48.907246,F


In [39]:
df_target.to_csv('mlfinalresult.csv')