# Question: 
# Age and Sex prediction from user keywoards (per session)

###    

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

In [2]:
PATH1='C:/Users/Michel El Kik/Documents/MICHEL EL KIK DOCS/ESCP DOCS/ESCP Semester 2/ML/Final Project ML/train.csv'
PATH2='C:/Users/Michel El Kik/Documents/MICHEL EL KIK DOCS/ESCP DOCS/ESCP Semester 2/ML/Final Project ML/test.csv'

In [3]:
train = pd.read_csv(PATH1)

In [4]:
test = pd.read_csv(PATH2)

### Testing with 1,000,000 instances (processing power limit for 1.5 Million instances)

In [5]:
train_df = train.head(1600000)
test_df =  test.head(1600000)

###    

# 1. Dataset Exploration

In [6]:
display(train_df.head(1600000), test_df.head(1600000))

Unnamed: 0,ID,keywords,age,sex
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62,F
1,2,restaurant:1;marrakech.shtml:1,35,M
2,3,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45,F
3,4,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46,F
4,5,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42,F
...,...,...,...,...
1599995,2284949,gaz:5;ramonage:5;chaudiere:5;affich:5;arnaque:...,65,M
1599996,2284950,calculatrice:1;affich:1;ticonnect:1;forum:1;co...,37,M
1599997,1222162,latecoere:1;profile:1;groupe:1;latelec:1;patri...,36,F
1599998,2284952,bousser:3;thil:5;inbox:8;etienne:3;copains:3;j...,47,M


Unnamed: 0,ID,keywords,age,sex
0,1,,,
1,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,,
2,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,,
3,4,002lundu83vnndv:1,,
4,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,,
...,...,...,...,...
1599995,1599996,quatre:1;quart:1;recette:1,,
1599996,1599997,jcms:1;seuils:1;dexoneration:1;futurs:1;p1_169...,,
1599997,1599998,france:6;meteo:6;previsions:6;seillans:6;ville:6,,
1599998,1599999,jhabite:1;emplacement:1;beneficient:1;p1_13227...,,


In [7]:
(len(train_df), len(test_df))

(1600000, 1600000)

###    

# 2. Data Preprocessing: Dropping Missing Values

In [8]:
train_df = train_df.dropna(axis=0)

In [9]:
#train_df.head(1600000) 

## 2.1 Dropping "age" and "sex" informtion to predict them based on the keywords

In [10]:
test_df = test_df.dropna(subset=['keywords'])

In [11]:
test_df.head(1600000) 

Unnamed: 0,ID,keywords,age,sex
1,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,,
2,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,,
3,4,002lundu83vnndv:1,,
4,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,,
5,6,disparition:1;vue:1;maelys:1;deuxieme:1;place:...,,
...,...,...,...,...
1599995,1599996,quatre:1;quart:1;recette:1,,
1599996,1599997,jcms:1;seuils:1;dexoneration:1;futurs:1;p1_169...,,
1599997,1599998,france:6;meteo:6;previsions:6;seillans:6;ville:6,,
1599998,1599999,jhabite:1;emplacement:1;beneficient:1;p1_13227...,,


## 2.2 Number of rows after dropping NaN values

In [12]:
(len(train_df), len(test_df))

(1424015, 1419642)

###    

# 3. Feature Extraction: BoW Approach for the Training Set

In [13]:
train_df['class'] = train_df["age"].map(str) + train_df["sex"] #sticks age and sex in one cell

In [14]:
train_df.head(1600000)

Unnamed: 0,ID,keywords,age,sex,class
0,1,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62,F,62F
1,2,restaurant:1;marrakech.shtml:1,35,M,35M
2,3,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45,F,45F
3,4,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46,F,46F
4,5,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42,F,42F
...,...,...,...,...,...
1599995,2284949,gaz:5;ramonage:5;chaudiere:5;affich:5;arnaque:...,65,M,65M
1599996,2284950,calculatrice:1;affich:1;ticonnect:1;forum:1;co...,37,M,37M
1599997,1222162,latecoere:1;profile:1;groupe:1;latelec:1;patri...,36,F,36F
1599998,2284952,bousser:3;thil:5;inbox:8;etienne:3;copains:3;j...,47,M,47M


##  3.1 Removing the following columns: ID, Age, Sex

In [15]:
train_df = train_df.drop(train_df.columns[[0, 2, 3]], axis=1)

In [16]:
train_df.head(1600000)

Unnamed: 0,keywords,class
0,fibre:16;quoi:1;dangers:1;combien:1;hightech:1...,62F
1,restaurant:1;marrakech.shtml:1,35M
2,payer:1;faq:1;taxe:1;habitation:1;macron:1;qui...,45F
3,rigaud:3;laurent:3;photo:11;profile:8;photopro...,46F
4,societe:1;disparition:1;proche:1;m%c3%a9lanie....,42F
...,...,...
1599995,gaz:5;ramonage:5;chaudiere:5;affich:5;arnaque:...,65M
1599996,calculatrice:1;affich:1;ticonnect:1;forum:1;co...,37M
1599997,latecoere:1;profile:1;groupe:1;latelec:1;patri...,36F
1599998,bousser:3;thil:5;inbox:8;etienne:3;copains:3;j...,47M


## 3.2 Function for "keywords" transformation

In [17]:
def convert_to_words(keywords):
    list_of_words = []
    for x in keywords:
        list_of_words.append(x.split(":",1)[0]) 
        # split(separator, maxsplit)
        # maxsplit = 1  
        # [0] indicates column 0 (keywords)
        # splitting words from their respective frequencies
    
    return list_of_words

In [18]:
def create_dict(keywords):
    return dict(x.split(':') for x in keywords)

## 3.3 Applying the respective functions  to obtain the "class" attribute

In [19]:
x_df = train_df.copy()

In [20]:
x_df['keywords'] = x_df['keywords'].apply(lambda row: row.split(";"))

In [21]:
#x_df.head(1600000)

In [22]:
x_df['keywords'] = x_df['keywords'].apply(lambda row: convert_to_words(row))

In [23]:
#x_df.head(1600000)

In [24]:
x_df['keywords'] = x_df['keywords'].apply(lambda row: ' '.join(row))

In [25]:
x_df.head(1600000)

Unnamed: 0,keywords,class
0,fibre quoi dangers combien hightech que est re...,62F
1,restaurant marrakech.shtml,35M
2,payer faq taxe habitation macron qui detail pr...,45F
3,rigaud laurent photo profile photoprofile bon ...,46F
4,societe disparition proche m%c3%a9lanie.gonide...,42F
...,...,...
1599995,gaz ramonage chaudiere affich arnaque forum,65M
1599996,calculatrice affich ticonnect forum connexion,37M
1599997,latecoere profile groupe latelec patrice.prevost,36F
1599998,bousser thil inbox etienne copains jeannot new...,47M


In [26]:
#x_df['keywords'][300]

## 3.4 Bag of Words Scoring using TF-IDF

In [31]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.1, min_df = 0.01)
# vectorizer = DictVectorizer()

###    

# 4. Partitioning: Split dataset into Train and Test Sets

In [32]:
X = x_df["keywords"]
y = x_df["class"]

In [33]:
X_train_df, X_test_df, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
X_train = vectorizer.fit_transform(X_train_df)

In [35]:
X_test = vectorizer.transform(X_test_df)

In [36]:
# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()

In [37]:
len(vectorizer.vocabulary_)

276

###    

# 5. Classification: Fitting Multinomial Naive Bayes to the Training Set

In [38]:
#Creating a Bag of Words Model
clf = MultinomialNB(alpha=.01)
clf.fit(X_train, y_train)

#predicting test set results
pred = clf.predict(X_test)

#scoring the model using the accuracy metric (Classification Accuracy)
score = metrics.accuracy_score(y_test, pred)

print("accuracy:   %0.3f" % score)

accuracy:   0.024


###    

# 6. NB Model Evaluation: Cross Validation using k-fold

In [39]:
# A 10-fold cross-validation test harness is used to demonstrate each metric
clf = MultinomialNB(alpha=.01)
clf.fit(X_train, y_train)
y_pred = cross_val_predict(clf, X_train, y_train, cv=10, n_jobs=-1, verbose=20)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 29.8min
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed: 29.8min remaining: 119.3min
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed: 29.8min remaining: 69.6min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed: 29.8min remaining: 44.7min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 29.8min remaining: 29.8min
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 29.8min remaining: 19.9min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 29.8min remaining: 12.8min
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed: 29.8min remaining:  7.5min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 31.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 31.0min finished


## 6.1 Cross Validation Accuracy

In [40]:
metrics.accuracy_score(y_test, pred)

0.02401660094872596

###    

# 7. Test Set Prediction

## 7.1 Feature Extraction: BoW Approach for Test Set 

In [41]:
test_df2 = test_df.copy()

In [42]:
test_df2['keywords'] = test_df2['keywords'].apply(lambda row: row.split(";"))

In [43]:
test_df2.head(1600000)

Unnamed: 0,ID,keywords,age,sex
1,2,"[cecilia.gosselin:1, flash:1, ville:1, obseque...",,
2,3,"[p1_1697235:1, peut:1, jcms:1, les:1, acceptee...",,
3,4,[002lundu83vnndv:1],,
4,5,"[high:3, patisserie:1, apple:3, tech:3, obsequ...",,
5,6,"[disparition:1, vue:1, maelys:1, deuxieme:1, p...",,
...,...,...,...,...
1599995,1599996,"[quatre:1, quart:1, recette:1]",,
1599996,1599997,"[jcms:1, seuils:1, dexoneration:1, futurs:1, p...",,
1599997,1599998,"[france:6, meteo:6, previsions:6, seillans:6, ...",,
1599998,1599999,"[jhabite:1, emplacement:1, beneficient:1, p1_1...",,


In [44]:
test_df2['keywords'] = test_df2['keywords'].apply(lambda row: convert_to_words(row))

In [45]:
#test_df2.head(1600000)

In [46]:
test_df2['keywords'] = test_df2['keywords'].apply(lambda row: ' '.join(row))

In [47]:
#test_df2.head(1600000)

In [48]:
test_df2['keywords'][2]

'p1_1697235 peut jcms les acceptees pas beneficiaire par assurances saisir vie fisc'

## 7.2 Attribute Prediction: 'prediction' = P('class')

In [49]:
test = vectorizer.transform(test_df2['keywords'])

In [50]:
pred2 = clf.predict(test)

In [51]:
pred2[1]

'66M'

In [52]:
test_df["prediction"] = pred2

In [53]:
test_df.head(1600000)

Unnamed: 0,ID,keywords,age,sex,prediction
1,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,,,48F
2,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,,,66M
3,4,002lundu83vnndv:1,,,42M
4,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,,,48F
5,6,disparition:1;vue:1;maelys:1;deuxieme:1;place:...,,,36F
...,...,...,...,...,...
1599995,1599996,quatre:1;quart:1;recette:1,,,40F
1599996,1599997,jcms:1;seuils:1;dexoneration:1;futurs:1;p1_169...,,,63M
1599997,1599998,france:6;meteo:6;previsions:6;seillans:6;ville:6,,,59M
1599998,1599999,jhabite:1;emplacement:1;beneficient:1;p1_13227...,,,52M


## 7.3 Equating "sex" to "prediction" 

In [54]:
test_df["sex"] = test_df["prediction"]

In [55]:
test_df.head(1600000)

Unnamed: 0,ID,keywords,age,sex,prediction
1,2,cecilia.gosselin:1;flash:1;ville:1;obseques:1;...,,48F,48F
2,3,p1_1697235:1;peut:1;jcms:1;les:1;acceptees:1;p...,,66M,66M
3,4,002lundu83vnndv:1,,42M,42M
4,5,high:3;patisserie:1;apple:3;tech:3;obseques:1;...,,48F,48F
5,6,disparition:1;vue:1;maelys:1;deuxieme:1;place:...,,36F,36F
...,...,...,...,...,...
1599995,1599996,quatre:1;quart:1;recette:1,,40F,40F
1599996,1599997,jcms:1;seuils:1;dexoneration:1;futurs:1;p1_169...,,63M,63M
1599997,1599998,france:6;meteo:6;previsions:6;seillans:6;ville:6,,59M,59M
1599998,1599999,jhabite:1;emplacement:1;beneficient:1;p1_13227...,,52M,52M


In [57]:
test_df['sex'] = test_df['sex'].apply(lambda row: re.sub("\d+", "", row))

In [58]:
#test_df.head(1600000)

In [60]:
test_df["age"] = test_df["prediction"]

In [61]:
# re.sub(chars_to_replace, string_to_replace_with, str)

test_df['age'] = test_df['age'].apply(lambda row: re.sub("\D+", "", row))

In [62]:
#test_df.head(1600000)

In [63]:
test_df = test_df.drop(['prediction'] , axis=1)

In [64]:
test_df = test_df.drop(['keywords'] , axis=1)

In [65]:
test_df.head(1600000)

Unnamed: 0,ID,age,sex
1,2,48,F
2,3,66,M
3,4,42,M
4,5,48,F
5,6,36,F
...,...,...,...
1599995,1599996,40,F
1599996,1599997,63,M
1599997,1599998,59,M
1599998,1599999,52,M


## 7.4 Predicted Dataset: Final Format

In [67]:
new_test_df = test_df.rename(columns={'age': 'age_pred' , 'sex': 'sex_pred'})

In [68]:
new_test_df.head(1600000)

Unnamed: 0,ID,age_pred,sex_pred
1,2,48,F
2,3,66,M
3,4,42,M
4,5,48,F
5,6,36,F
...,...,...,...
1599995,1599996,40,F
1599996,1599997,63,M
1599997,1599998,59,M
1599998,1599999,52,M


###    

# 8. Exporting Predicted Dataset as CSV

In [69]:
new_test_df.to_csv(r'C:/Users/Michel El Kik/Documents/MICHEL EL KIK DOCS/ESCP DOCS/ESCP Semester 2/ML/Final Project ML/predicted_set.csv', index=False)

###    