In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
%matplotlib inline 
import matplotlib
import matplotlib.pyplot as plt

### Import package

In [21]:
import sys
import os
sys.path.append('../')
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("Data cleaning").getOrCreate()
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from yaml import load as yaml_load

### Import classes from src

In [22]:
from src.data.make_dataset import LoadDataframe
from src.features.build_features import extract_features_regression
from src.models.train_model import model_regression

### Functions to load the YAML configuration file

In [23]:
def _load_config_file(config_file):
    """
    Load configuration file
    :param config_file: path to the YAML configuration file
    :type: string
    :return: configuration parsed from the file (a dict when the file's
             top level is a mapping)
    :rtype: dict
    """
    # Imported here so the function does not depend on which `yaml` names
    # another notebook cell happened to import.
    from yaml import safe_load

    # Calling yaml.load(stream) without an explicit Loader raises TypeError on
    # PyYAML >= 6 and can construct arbitrary Python objects on older
    # versions. safe_load only builds plain Python data, which is all a
    # configuration file needs.
    with open(config_file) as yml_config:
        return safe_load(yml_config)

def _build_configuration(config_file):
    """
    Build the operation configuration dict
    :param config_file: path to the yaml configuration file
    :type: string
    :return: global configuration read from that file
    :rtype: dict
    """
    # Thin wrapper: the actual YAML parsing lives in _load_config_file.
    return _load_config_file(config_file)


In [24]:
def visualisation_prediction(y_test, y_pred):
    """
    Scatter-plot predicted values against actual values with a y = x reference line
    :param y_test: actual values; must provide ``.min()`` and ``.max()``
                   (e.g. a pandas Series)
    :param y_pred: predicted values, same length as ``y_test``
    :return: None (the figure is drawn through matplotlib)
    """
    import matplotlib
    import matplotlib.pyplot as plt

    # The style sheet has to be active BEFORE the figure is created; applying
    # it afterwards leaves the first figure unstyled. The tick sizes are set
    # after the style so the style sheet cannot override them.
    # Note: both calls change matplotlib's global rcParams.
    plt.style.use('ggplot')
    matplotlib.rc('xtick', labelsize=30)
    matplotlib.rc('ytick', labelsize=30)

    _fig, ax = plt.subplots(figsize=(50, 40))

    # Predictions on the x axis, actual values on the y axis.
    ax.plot(y_pred, y_test, 'ro')
    ax.set_xlabel('Predicted Crime', fontsize=30)
    ax.set_ylabel('Actual Crime', fontsize=30)
    ax.set_title('Predicted Y (Crimes) to the Actual Y (Crimes)', fontsize=30)

    # Dashed diagonal spanning the range of the actual values: points on this
    # line are perfect predictions.
    lowest, highest = y_test.min(), y_test.max()
    ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)

### Load the YAML configuration file

In [25]:
# Locate the config relative to the project root instead of hard-coding one
# user's home directory. This assumes the notebook runs from a direct
# subdirectory of the project root -- the same assumption already made by
# sys.path.append('../') for importing `src`. TODO confirm the layout.
# Set CRIME_CONFIG_FILE to point at a config stored elsewhere.
config_file = os.environ.get(
    "CRIME_CONFIG_FILE",
    os.path.join("..", "config", "config.yml"),
)
config = _build_configuration(config_file)

In [26]:
config['model_ML_regression']['train_mode']

{'train': True, 'start_date': 2012, 'end_date': 2013}

### Instantiate the `LoadDataframe` class (for more information, run `help(LoadDataframe)`)

In [27]:
%%time
obj_df_loaded = LoadDataframe(config, '2012', '2013')

CPU times: user 247 µs, sys: 50 µs, total: 297 µs
Wall time: 303 µs


In [None]:
df_nb_crimes = obj_df_loaded.df_nb_crimes()

In [None]:
df_nb_crimes.limit(5).toPandas().T

In [12]:
obj_extract_features_regression =  extract_features_regression(config, df_nb_crimes)

In [13]:
df_ml = obj_extract_features_regression.extract_feature()

In [16]:
df_crime = obj_df_loaded.df_crime()



In [14]:
df_ml.printSchema()

root
 |-- label: long (nullable = false)
 |-- pct_housing_crowded: double (nullable = false)
 |-- pct_households_below_poverty: double (nullable = false)
 |-- pct_age16_unemployed: double (nullable = false)
 |-- pct_age25_no_highschool: double (nullable = false)
 |-- pct_not_working_age: double (nullable = false)
 |-- per_capita_income: double (nullable = false)
 |-- hardship_index: double (nullable = false)
 |-- primary_type_ARSON: long (nullable = true)
 |-- primary_type_CRIM SEXUAL ASSAULT: long (nullable = true)
 |-- primary_type_CRIMINAL DAMAGE: long (nullable = true)
 |-- primary_type_CRIMINAL TRESPASS: long (nullable = true)
 |-- primary_type_DECEPTIVE PRACTICE: long (nullable = true)
 |-- primary_type_GAMBLING: long (nullable = true)
 |-- primary_type_HOMICIDE: long (nullable = true)
 |-- primary_type_INTERFERENCE WITH PUBLIC OFFICER: long (nullable = true)
 |-- primary_type_INTIMIDATION: long (nullable = true)
 |-- primary_type_KIDNAPPING: long (nullable = true)
 |-- primary_t

In [None]:
obj_model_regression = model_regression(config, df_ml)

In [None]:
model = obj_model_regression.train_model()

### The model will be tested on 2014 data

In [None]:
%%time
obj_df_loaded_2014 = LoadDataframe(config, '2014', '2015')

### Load the data for testing

In [None]:
# Use the 2014 loader: `obj_df_loaded` was built for '2012'-'2013' (the
# training range), so reading from it would score the model on its own
# training data.
df_nb_crimes_2014 = obj_df_loaded_2014.df_nb_crimes()

### Extract features for 2014 data

In [None]:
obj_extract_features_regression_2014 =  extract_features_regression(config, df_nb_crimes_2014)
df_ml_2014 = obj_extract_features_regression_2014.extract_feature()

### Prediction

In [None]:
predictions = model.transform(df_ml_2014)

### Show some results

In [None]:
prediction_df = predictions.select('label','prediction').toPandas()

## Score 

In [None]:
predictions.printSchema()

In [None]:
visualisation_prediction(prediction_df['label'], prediction_df['prediction'])

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error

In [None]:
r2_score(prediction_df['label'], prediction_df['prediction'])

In [None]:
def plot_result(type_crime, area_name, title=''):
    """
    Bar-plot selected columns of the notebook-level ``predictions`` DataFrame
    :param type_crime: crime-type column name, e.g. 'primary_type_ASSAULT'
    :type: string
    :param area_name: community-area column name,
                      e.g. 'community_area_name_Austin'
    :type: string
    :param title: optional text appended to the generated plot title
    :type: string
    :return: None
    """
    month_columns = ['month_{}'.format(month) for month in range(1, 13)]
    columns_to_select = [type_crime, area_name] + month_columns + ['label', 'prediction']

    # NOTE(review): select() only projects columns, so every row of
    # `predictions` is plotted. If only the rows matching this crime type and
    # area were meant to be shown, a row filter is missing -- confirm.
    df = predictions.select(*columns_to_select).toPandas()

    # Drop the one-hot column prefixes so the title reads e.g.
    # "ASSAULT in Austin"; empty parts are skipped to avoid stray spaces.
    title_parts = [
        type_crime.replace('primary_type_', ''),
        'in',
        area_name.replace('community_area_name_', ''),
        title,
    ]
    plot_title = ' '.join(part for part in title_parts if part)

    df.plot.bar(rot=0, figsize=(15, 10), fontsize=10, title=plot_title)

    
 

In [None]:
plot_result('primary_type_ASSAULT', 'community_area_name_Austin')