<a href="https://colab.research.google.com/github/okliviaf/PredictingPS/blob/master/Per_query_PS_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install -U -q PyDrive
!pip install -U -q seaborn
!pip install -U -q tensorflow-gpu==1.15.2
from tensorflow import set_random_seed
import numpy as np
import seaborn as sns
import pandas as pd
import csv
import os
from sklearn import svm
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import files
from keras.models import Sequential
from keras.layers import Dense
from sklearn import preprocessing
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
import numpy
from collections import namedtuple
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Authenticate the Colab user and create the PyDrive client used by the
# data-loading cell to download the CSV files.
# NOTE(review): this runs unconditionally, before LOCAL_RUN is defined in a
# later cell, so it is executed even when the notebook is meant to run locally.
auth.authenticate_user()
gauth = GoogleAuth()
# Reuse the application-default credentials obtained by the authentication above.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Container returned by dataset_preparation:
#   X            - feature matrix passed to the classifiers
#   y            - label vector produced by the label encoder
#   num_features - number of columns in X
#   num_classes  - number of target classes, as given by the caller
Dataset = namedtuple("Dataset", ["X", "y", "num_features", "num_classes"])

# Data Loader

The training data for the machine learning models has been uploaded to Google Drive and made freely available. We use the PyDrive client for the Google Drive API to download the data automatically when the notebook runs in Google Colab.

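Alternatively, if you decide to download the notebook and run it locally, please remember to set the variable `LOCAL_RUN` to `True`. In that case the download step is skipped, so the files `FA_Binary.csv`, `NC_Binary.csv` and `Overall_Trio.csv` must already be present in a local `data/` folder.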

In [0]:
# Set to True when running outside Colab; the Google Drive download is then
# skipped and the CSV files are expected to already be in the 'data' folder.
LOCAL_RUN = False

# Google Drive file id of every training CSV, keyed by the local path the
# file is downloaded to.
DATA_FILES = {
    "data/FA_Binary.csv": "1kyHohSD4iAYJWv6ryUQUpkPjbw1nOryO",
    "data/NC_Binary.csv": "1zXWz49xBUU7iQ9AAokXWr_9vku3r1Yms",
    "data/Overall_Trio.csv": "1qdeITwpqP2Xvwuu5j7BHbWriL1DW8XSM",
}

if not LOCAL_RUN:
    # Create the 'data' folder that will contain the training data;
    # exist_ok makes re-running the cell a no-op instead of a check-then-create.
    os.makedirs("data", exist_ok=True)

    for local_path, file_id in DATA_FILES.items():
        downloaded = drive.CreateFile({"id": file_id})
        downloaded.GetContentFile(local_path)

# One DataFrame per task; the later cells refer to these names.
FA_Binary = pd.read_csv("data/FA_Binary.csv")
NC_Binary = pd.read_csv("data/NC_Binary.csv")
Overall_Trio = pd.read_csv("data/Overall_Trio.csv")


In [0]:
# Report the (rows, columns) shape of each loaded DataFrame.
size_reports = (
    ("'Finding As' dataset size: {}", FA_Binary),
    ("'Number comparison' dataset size: {}", NC_Binary),
    ("'Overall' dataset size: {}", Overall_Trio),
)
for template, frame in size_reports:
    print(template.format(frame.shape))

In [0]:
# Columns of the CSV files that are used as model input features.
attributes = [
    "no_of_actions",
    "time_query",
    "time_on_serp",
    "time_on_documents",
    "time_session_overall",
    "serp_page_viewed_to",
    "document_click_count",
    "document_click_depth",
    "document_hover_count_raw",
    "document_hover_count",
    "document_hover_depth",
    "ad_hover_count",
    "ad_hover_count_top",
    "ad_hover_count_side",
    "ad_hover_count_bot",
    "ad_click_count",
    "depth",
    "time_per_snippet",
    "time_per_document",
    "query_length",
    "query_tokens_count",
]

seed = 7                # random seed shared by NumPy, TensorFlow and the models
num_splits = 5          # number of folds used for cross-validation
max_num_attributes = 5  # number of features kept by the feature selector

# Shared selector: keeps the max_num_attributes features with the highest
# mutual-information score.
feature_selector = SelectKBest(score_func=mutual_info_classif, k=max_num_attributes)

# One score vector is appended here every time feature selection is applied.
feature_scores = []

# fix random seed for reproducibility
np.random.seed(seed)
set_random_seed(seed)

In [0]:
# Data preparation
def dataset_preparation(dataset_df, num_classes, feature_names, apply_feature_selection=False):
    """
        Turns a raw DataFrame into a `Dataset` ready for model training.

        The columns listed in `feature_names` are standardised with
        `preprocessing.scale` and the last column of `dataset_df` is taken as
        the label and integer-encoded with a `LabelEncoder`.

        When `apply_feature_selection` is True, the module-level
        `feature_selector` is fitted on the whole dataset, X is reduced to the
        selected columns and the selector's scores are appended to the
        module-level `feature_scores` list.

        Returns a `Dataset` whose `num_features` is the number of columns in
        the returned X and whose `num_classes` is the value passed in.
    """
    # input variables (X): standardised feature columns
    raw_features = dataset_df[feature_names].to_numpy()
    X = preprocessing.scale(raw_features)

    # output variable (y): encoded values of the last column
    raw_labels = dataset_df.iloc[:, -1].to_numpy().tolist()
    y = preprocessing.LabelEncoder().fit_transform(raw_labels)

    if not apply_feature_selection:
        return Dataset(X=X, y=y, num_features=len(feature_names), num_classes=num_classes)

    X = feature_selector.fit_transform(X, y)
    feature_scores.append(feature_selector.scores_)

    return Dataset(X=X, y=y, num_features=X.shape[-1], num_classes=num_classes)

In [0]:
datasets = {
    "FA_Binary": dataset_preparation(FA_Binary, 2, attributes, apply_feature_selection=True),
    "NC_Binary": dataset_preparation(NC_Binary, 2, attributes, apply_feature_selection=True),
    "Overall_Trio": dataset_preparation(Overall_Trio, 3, attributes, apply_feature_selection=True)
}

# dataset_preparation appends one score vector to the shared feature_scores
# list per call, so the vectors produced by this cell are the last
# len(datasets) entries, in the same order as the dict above. Indexing from
# the start of the list would print stale scores whenever the list already
# holds earlier entries (for example after re-running this cell).
latest_scores = feature_scores[-len(datasets):]

for dataset_name, scores in zip(datasets, latest_scores):
    print("Dataset name: {}".format(dataset_name))
    # Up to ten attributes with the highest score, best first.
    indices = np.argsort(scores)[::-1][:10]
    for ind in indices:
        print(f"- {attributes[ind]} ({scores[ind]})")


In [0]:
def nn_classification_model(input_size, output_size, hidden_sizes=(64,), activation="relu"):
    """
        Creates a factory for a feed-forward neural network classifier.

        The network has one hidden Dense layer per entry of `hidden_sizes`.
        Every hidden layer after the first one is followed by a Dropout layer
        (rate 0.35). The output layer has `output_size` neurons, one per
        class, and uses a Softmax activation to obtain a probability
        distribution over the classes.

        Parameters:
            input_size: number of input features.
            output_size: number of classes.
            hidden_sizes: non-empty sequence with the size of each hidden layer.
            activation: activation function of the hidden layers.

        Returns:
            A zero-argument function that builds and compiles a new model
            (Adam optimizer, sparse categorical cross-entropy loss, accuracy
            metric) every time it is called.

        Raises:
            ValueError: if `hidden_sizes` is empty.
    """
    # Snapshot the sizes so that the returned factory is not affected if the
    # caller mutates its list afterwards.
    layer_sizes = tuple(hidden_sizes)
    if not layer_sizes:
        raise ValueError("hidden_sizes must contain at least one layer size")

    def model_definition():
        model = Sequential()
        # first hidden layer: the only one that declares the input dimension
        model.add(Dense(layer_sizes[0], input_dim=input_size, kernel_initializer='glorot_uniform', activation=activation))

        # remaining hidden layers, each followed by dropout
        for hidden_size in layer_sizes[1:]:
            model.add(Dense(hidden_size, kernel_initializer='glorot_uniform', activation=activation))
            model.add(Dropout(0.35, seed=seed))

        # output layer
        model.add(Dense(output_size, kernel_initializer='glorot_uniform', activation="softmax"))
        adam = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, amsgrad=False)

        # Compile model
        model.compile(loss='sparse_categorical_crossentropy',  metrics=["accuracy"], optimizer = adam)

        return model

    return model_definition

def create_pipeline(input_size, output_size):
    """
        Builds the classifiers compared in the experiments.

        Returns a dict that maps a model name to an unfitted estimator: three
        Keras networks with a growing number of hidden layers, followed by an
        SVM, a decision tree, a random forest, a majority-class baseline and
        a logistic regression.
    """
    def keras_classifier(layer_sizes):
        # All neural variants share the same training settings.
        return KerasClassifier(
            nn_classification_model(input_size, output_size, hidden_sizes=layer_sizes),
            epochs=50,
            batch_size=32
        )

    models = {}

    # neural networks
    models["neural"] = keras_classifier([32])
    models["neural_32_16"] = keras_classifier([32, 16])
    models["neural_32_16_8"] = keras_classifier([32, 16, 8])

    # scikit-learn models, all seeded with the shared seed
    models["svm"] = svm.SVC(shrinking=False, decision_function_shape="ovo", random_state=seed)
    models["decision_tree"] = DecisionTreeClassifier(criterion="entropy", random_state=seed)
    models["random_forest"] = RandomForestClassifier(n_estimators=10, random_state=seed)
    models["majority_class"] = DummyClassifier(strategy="most_frequent", random_state=seed)
    models["logistic_regression"] = LogisticRegression(solver="lbfgs", multi_class="multinomial", random_state=seed)

    return models

In [0]:
def save_results(filename, all_results):
    """
        Writes the mean accuracy of every model on every dataset to a CSV file.

        Parameters:
            filename: path of the CSV file to create (overwritten if present).
            all_results: dict mapping a dataset name to a dict that maps a
                model name to a dict containing the key "mean_accuracy".

        The header row is "Dataset" followed by the model names of the first
        dataset; each following row holds one dataset. When `all_results` is
        empty only the "Dataset" header is written.
    """
    # The first dataset defines the column order; an empty input yields no
    # model columns instead of an IndexError.
    first_results = next(iter(all_results.values()), {})
    model_names = list(first_results)

    # newline="" leaves line endings to the csv module; without it every row
    # is followed by a blank line on platforms that translate "\n".
    with open(filename, mode="w", newline="") as out_file:
        writer = csv.writer(out_file)

        writer.writerow(["Dataset"] + model_names)
        writer.writerows(
            [dataset] + [results[model]["mean_accuracy"] for model in model_names]
            for dataset, results in all_results.items()
        )

In [0]:
# Experiment 1: every attribute is used as an input feature.
dataset_specs = (
    ("FA_Binary", FA_Binary, 2),
    ("NC_Binary", NC_Binary, 2),
    ("Overall_Trio", Overall_Trio, 3),
)
datasets = {
    name: dataset_preparation(frame, class_count, attributes)
    for name, frame, class_count in dataset_specs
}

# dataset name -> model name -> {"mean_accuracy", "std_accuracy"}
all_results = {}

for dataset_name, dataset in datasets.items():
    print("# Starting train/test procedure for dataset {}".format(dataset_name))
    dataset_results = {}
    all_results[dataset_name] = dataset_results
    pipeline = create_pipeline(dataset.num_features, dataset.num_classes)

    # Cross-validate every model on identically seeded stratified folds.
    for model_name, model in pipeline.items():
        kfold = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
        scores = cross_val_score(model, dataset.X, dataset.y, cv=kfold)
        dataset_results[model_name] = {
            "mean_accuracy": scores.mean(),
            "std_accuracy": scores.std()
        }

    # Per-dataset summary table.
    print("Model\tAccuracy")
    for model, results in dataset_results.items():
        print(f"{model}\t{results['mean_accuracy']}")
        

In [0]:
# Persist the mean accuracies of the all-attributes experiment.
results_file = "experiment_results_all_attributes.csv"

announcement = "# Saving results to file {}".format(results_file)
print(announcement)
save_results(results_file, all_results)

In [0]:
# Experiment 2: same procedure, but on the features kept by feature selection.
# NOTE(review): dataset_preparation fits the selector on the full dataset, i.e.
# before the cross-validation split below - confirm this is intended.
dataset_specs = (
    ("FA_Binary", FA_Binary, 2),
    ("NC_Binary", NC_Binary, 2),
    ("Overall_Trio", Overall_Trio, 3),
)
datasets = {
    name: dataset_preparation(frame, class_count, attributes, apply_feature_selection=True)
    for name, frame, class_count in dataset_specs
}

# dataset name -> model name -> {"mean_accuracy", "std_accuracy"}
all_results = {}

for dataset_name, dataset in datasets.items():
    print("# Starting train/test procedure for dataset {}".format(dataset_name))
    dataset_results = {}
    all_results[dataset_name] = dataset_results
    pipeline = create_pipeline(dataset.num_features, dataset.num_classes)

    # Cross-validate every model on identically seeded stratified folds.
    for model_name, model in pipeline.items():
        kfold = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
        scores = cross_val_score(model, dataset.X, dataset.y, cv=kfold)
        dataset_results[model_name] = {
            "mean_accuracy": scores.mean(),
            "std_accuracy": scores.std()
        }

    # Per-dataset summary table.
    print("Model\tAccuracy")
    for model, results in dataset_results.items():
        print(f"{model}\t{results['mean_accuracy']}")

In [0]:
# Persist the mean accuracies of the feature-selection experiment.
results_file = "experiment_results_with_feature_selection.csv"

announcement = "# Saving results to file {}".format(results_file)
print(announcement)
save_results(results_file, all_results)