In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to the project's `src` modules are picked up without a restart.
%load_ext autoreload
%autoreload 2


In [None]:
%matplotlib inline 
import matplotlib
import matplotlib.pyplot as plt

### Import package

In [None]:
import sys
import os
# Add the parent directory to the import path so the `src` package imported further down
# can be found (assumes the notebook is run from one level below the project root).
sys.path.append('../')
import findspark
# findspark.init() makes the local Spark installation's pyspark importable, so it has to
# run before the pyspark imports that follow — keep this order.
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Local Spark session; "local[*]" runs Spark in-process using all available cores.
spark = SparkSession.builder.master("local[*]").appName("Data cleaning").getOrCreate()
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from yaml import load as yaml_load
import pyspark.sql.functions as func
from sklearn.metrics import confusion_matrix
import numpy as np

### Import classes from src

In [None]:
from src.data.make_dataset import LoadDataframe
from src.features.build_features import extract_features_classification
from src.models.train_model import model_classification

### Functions to load yaml file configuration

In [None]:
def _load_config_file(config_file):
    """
    Load a YAML configuration file.

    The file is parsed with ``yaml.safe_load``. Calling ``yaml.load`` without an
    explicit ``Loader`` is deprecated since PyYAML 5.1 and raises ``TypeError``
    in PyYAML 6; the safe loader also only builds plain Python objects, which is
    all a configuration file needs.

    :param config_file: path to the YAML configuration file
    :type config_file: str
    :return: parsed configuration (a dict for a mapping-style YAML document)
    :rtype: dict
    """
    # Local import: the notebook's import cell only brings in ``yaml.load``.
    from yaml import safe_load

    with open(config_file) as yml_config:
        return safe_load(yml_config)

def _build_configuration(config_file):
    """
    Produce the global operation configuration from a yaml file.

    :param config_file: path to the yaml configuration file
    :type config_file: str
    :return: the global configuration, exactly as returned by ``_load_config_file``
    :rtype: dict
    """
    # The configuration currently comes from the yaml file only.
    return _load_config_file(config_file)


In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          figsize=None):
    """
    Print a confusion matrix and draw it as an annotated heat map on a new figure.

    :param cm: confusion matrix; rows are labelled as true labels and columns as
        predicted labels
    :type cm: array-like of shape (n_classes, n_classes)
    :param classes: tick labels, in the same order as the rows/columns of ``cm``
    :param normalize: if True, every row is divided by its sum before printing
        and plotting; rows that sum to zero are left at 0
    :param title: title of the plot
    :param cmap: matplotlib colormap passed to ``imshow``
    :param figsize: (width, height) in inches of the new figure; ``None`` uses
        matplotlib's configured default (``rcParams['figure.figsize']``)
    :return: None
    """
    import itertools

    cm = np.asarray(cm)
    if normalize:
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        # A row without any sample would give 0/0 = NaN, which also turns the
        # colour threshold below into NaN; keep such rows at 0 instead.
        cm = np.divide(cm.astype('float'), row_sums,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_sums != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Open the figure before drawing. Creating it after the plot (as was done
    # previously) leaves the plot on whatever figure was current and adds an
    # empty figure to the output.
    plt.figure(figsize=figsize)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for readability.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

### Loads yaml file configuration

In [None]:
# Location of the YAML configuration. Set the CRIME_CHICAGO_CONFIG environment variable
# to run the notebook on another machine; the original absolute path stays the default.
config_file = os.environ.get(
    "CRIME_CHICAGO_CONFIG",
    "/home/ml/Documents/Crime_Chigaco_Spark/config/config.yml",
)
config = _build_configuration(config_file)

In [None]:
# Display the 'train' entry of the 'model_ML_classification' configuration section.
config['model_ML_classification']['train']

### Load the LoadDataframe class; for more information, use help(LoadDataframe)

In [None]:
%%time
# Data loader for the training period, built from the configuration and two year bounds.
# NOTE(review): the end year '2015' is presumably exclusive, since the model is described
# below as trained on 2012-2014 data — confirm in LoadDataframe.
obj_df_loaded = LoadDataframe(config, '2012', '2015')

### loads temperature data, 
### socio-economic data, 
### weather description.
### Remark: weather description and temperature data are valid from 2012

In [None]:
# Pull the three input frames from the training-period loader:
# crime + socio-economic data, temperature, and sky / weather description.
df_crimes_socio = obj_df_loaded.df_crime_socio()
df_temp = obj_df_loaded.df_temperature()
df_sky  = obj_df_loaded.df_sky()

### Loads extract_features_classification

In [None]:
# Feature extractor built from the configuration and the three training-period frames.
obj_extract_features_classification = extract_features_classification(config, df_crimes_socio, df_temp, df_sky)

In [None]:
# Training frame used for machine learning (output of the feature extraction).
df_ml = obj_extract_features_classification.extract_feature()

### Loads model_classification

In [None]:
# Model wrapper initialised with the configuration and the training frame.
obj_model_classification = model_classification(config, df_ml)

### Show some data from df_ml (dataframe for machine learning, features extraction)

In [None]:
# Collect only 10 rows to pandas and transpose them, so each column of df_ml is shown as a row.
df_ml.limit(10).toPandas().T

### Train the model: in this case a random forest classifier trained on 2012 - 2014 data

In [None]:
%%time
# Train the random forest through the project's model wrapper; the returned model is
# applied to the test features further down via .transform().
rf_model = obj_model_classification.train_RF()

### the model will be tested on 2015 data

In [None]:
%%time
# Data loader for the test period (year bounds '2015' and '2016'; described above as 2015 data).
obj_df_loaded_2015 = LoadDataframe(config, '2015', '2016')

### Loads data for testing

In [None]:
# Same three input frames as for training, taken from the test-period loader.
df_crimes_socio_2015 = obj_df_loaded_2015.df_crime_socio()
df_temp_2015 = obj_df_loaded_2015.df_temperature()
df_sky_2015  = obj_df_loaded_2015.df_sky()

### Extract features for 2015 data

In [None]:
# Feature extractor for the test-period frames, using the same configuration as for training.
obj_extract_features_classification_2015 = extract_features_classification(
    config, df_crimes_socio_2015, df_temp_2015, df_sky_2015
)

In [None]:
# Machine-learning frame (extracted features) for the test period.
df_ml_test = obj_extract_features_classification_2015.extract_feature()

### Prediction

In [None]:
# Apply the trained model to the test features; the cells below read the 'prediction'
# and 'predictedLabel' columns from the result.
df_prediction_test = rf_model.transform(df_ml_test)

### Show some results

In [None]:
# Bring at most 1000 rows to pandas and show 10 of them picked at random
# (no seed is set, so the displayed rows differ between runs).
df_prediction_test.select('primary_type','label', 'prediction','predictedLabel').limit(1000).toPandas().sample(10)

### Evaluation of model

In [None]:
# Evaluator computing the accuracy metric from the 'label' and 'prediction' columns.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [None]:
# Keep only the two columns the evaluator reads, and cache the result.
predictions = df_prediction_test.select(func.col("label"), func.col("prediction")).cache()

In [None]:
# Accuracy of the model on the test-period predictions.
accuracy = evaluator.evaluate(predictions)

## Score 

In [None]:
# True class names ('primary_type'), collected to pandas for scikit-learn.
# NOTE(review): y_test and y_pred come from two separate toPandas() calls; this assumes
# both return the rows in the same order — consider selecting both columns in one collect.
y_test = df_prediction_test.select('primary_type').toPandas()['primary_type']

In [None]:
# Predicted class names ('predictedLabel'), collected to pandas for scikit-learn.
y_pred = df_prediction_test.select('predictedLabel').toPandas()['predictedLabel']

In [None]:
# Show the accuracy computed by the evaluator above.
print(accuracy)

### Confusion matrix

In [None]:
# Compute confusion matrix

# Derive the class order from the data and pass it explicitly. confusion_matrix orders
# its rows/columns by sorted label value, so tick labels taken from a hand-written list
# in any other order would end up on the wrong rows and columns.
class_names = sorted(set(y_test) | set(y_pred))
cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)
np.set_printoptions(precision=2)

# Default size for figures created from here on (pyplot has no `figsize` setting of its
# own; the default is configured through rcParams).
plt.rcParams['figure.figsize'] = (10, 10)

# Plot non-normalized confusion matrix
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

In [None]:
from pyspark.ml.classification import RandomForestClassificationModel

from pyspark.ml import PipelineModel

In [None]:
# Load a persisted pipeline model from disk.
# NOTE(review): absolute, machine-specific path. The configuration exposes
# config['model_ML_classification']['path']['path_model_rf'] (displayed in the next cell) —
# confirm whether it points to the same location and use it here instead.
pipelineModel = PipelineModel.load('/home/ml/Documents/Crime_Chigaco_Spark/models/rfModel')

In [None]:
# Display the 'path_model_rf' entry configured under 'model_ML_classification' -> 'path'.
config['model_ML_classification']['path']['path_model_rf']