In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline 

In [None]:
from yaml import load as yaml_load
import findspark
findspark.init()
import pyspark
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession
import pyspark.sql.functions as func
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StandardScaler, VectorAssembler, VectorIndexer, StandardScaler
from pyspark.sql.functions import *
from pyspark.sql.functions import pandas_udf, PandasUDFType
# Local-mode Spark session named "Data cleaning"; getOrCreate() hands back
# an already-running session instead of building a second one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Data cleaning")
    .getOrCreate()
)

In [None]:
import os
import sys
# Put the parent directory on the import path (presumably so the `src`
# package imported in a later cell resolves — confirm against the repo layout).
# The membership check keeps re-running this cell from stacking duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

In [None]:
# SparkContext that backs the `spark` session.
sc = spark.sparkContext

In [None]:
import json

In [None]:
from src.data.make_dataset import LoadDataframe

In [None]:
def _load_config_file(config_file):
    """
    Load a YAML configuration file.

    :param config_file: path to the YAML configuration file
    :type config_file: str
    :return: the parsed file content (a dict when the top level is a mapping)
    :rtype: dict
    """
    # Imported here so the helper carries its own dependency.
    from yaml import safe_load

    # yaml.load() without an explicit Loader raises TypeError on PyYAML >= 6
    # and can construct arbitrary Python objects on older releases.
    # safe_load only builds plain data types, which is all a config file needs.
    with open(config_file) as yml_config:
        return safe_load(yml_config)

def _build_configuration(config_file):
    """
    Produce the global configuration dict used by the operations.

    :param config_file: path to the YAML configuration file
    :type config_file: str
    :return: the global configuration
    :rtype: dict
    """
    # The YAML content is the whole configuration; nothing is layered on top.
    return _load_config_file(config_file)
def visualisation_prediction(y_test, y_pred):
    """
    Scatter predicted values against actual values and draw a dashed diagonal.

    Predictions go on the x axis and actual values on the y axis. The dashed
    line runs from ``y_test.min()`` to ``y_test.max()`` on both axes.
    The call also updates matplotlib's global tick-label size and style.

    :param y_test: actual values; must provide ``min()`` and ``max()``
    :param y_pred: predicted values
    :return: None
    """
    import matplotlib
    import matplotlib.pyplot as plt

    text_size = 30
    for tick_group in ('xtick', 'ytick'):
        matplotlib.rc(tick_group, labelsize=text_size)

    fig, ax = plt.subplots(figsize=(50, 40))
    # The style switch stays after figure creation, as in the original ordering.
    plt.style.use('ggplot')

    ax.plot(y_pred, y_test, 'ro')
    ax.set_xlabel('Predicted Crime', fontsize=text_size)
    ax.set_ylabel('Actual Crime', fontsize=text_size)
    ax.set_title('Predicted Y (Crimes) to the Actual Y (Crimes)', fontsize=text_size)

    lowest, highest = y_test.min(), y_test.max()
    ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
# Location of the YAML configuration. Setting the CRIMES_CONFIG_FILE
# environment variable overrides the machine-specific default path, so the
# notebook can run elsewhere without editing this cell.
config_file = os.environ.get(
    "CRIMES_CONFIG_FILE",
    "/home/ml/Documents/crimes_chigaco/config/config.yml",
)
config = _build_configuration(config_file)

In [None]:
%%time
# Build the project's data loader from the parsed configuration.
# '2013' and '2014' are presumably the first and last year to load —
# TODO confirm against LoadDataframe's signature in src/data/make_dataset.
obj_df_loaded = LoadDataframe(config, '2013', '2014')

In [None]:
# Temperature and sky frames provided by the loader
# (their contents are defined in src.data.make_dataset).
df_temp = obj_df_loaded.df_temperature()
df_sky  = obj_df_loaded.df_sky()

In [None]:
# Frame returned by the loader's df_socio(); joined to the crime data below
# on 'community_area_number'.
df_socio = obj_df_loaded.df_socio()

In [None]:
# Crime frame returned by the loader's df_crime().
df_crime = obj_df_loaded.df_crime()

In [None]:
# Number of rows in df_crime (shown as the cell output).
df_crime.count()

In [None]:
# Inner join: only crime rows whose community_area_number also appears in
# df_socio are kept.
df_crime_socio = df_crime.join(
    df_socio,
    on=['community_area_number'],
    how="inner",
)

In [None]:
# Row count after the inner join; compare with df_crime.count() above to see
# how many rows the join kept.
df_crime_socio.count()

In [None]:
# Materialise at most 200 rows of the joined frame as a pandas DataFrame.
dfp = df_crime_socio.limit(200).toPandas()

In [None]:
import pandas as pd
from pyspark.sql.functions import col, pandas_udf, udf
from pyspark.sql.types import LongType, StringType, FloatType
import re

In [None]:
def duration_day_func(x):
    """
    Length of the day in Chicago, in seconds, for the given date.

    :param x: value passed as ``date`` to astral's ``Location.sun``; may be ``None``
    :return: seconds between sunrise and sunset as a float,
             or ``None`` when ``x`` is ``None``
    """
    # A missing date must stay missing: given date=None, astral's ``sun`` falls
    # back to the current date and would silently fabricate a value.
    if x is None:
        return None

    from astral import Astral

    city_name = 'Chicago'
    a = Astral()
    a.solar_depression = 'civil'
    city = a[city_name]
    # local=True asks astral for times in the city's own time zone.
    sun = city.sun(date=x, local=True)
    return float((sun['sunset'] - sun['sunrise']).total_seconds())


In [None]:
#dfp['duree_day'] = dfp['date'].apply(lambda x: duration_day(x))

In [None]:
def _last_word(text):
    """Return the trailing run of word characters of ``text``, or None if there is none."""
    if text is None:
        return None
    found = re.search(r"(\w+)$", text)
    return found.group(1) if found else None


def _suffix_flag_udf(*suffixes):
    """Build a UDF that returns 1 when its input equals one of ``suffixes``, else 0."""
    # An explicit LongType keeps the flag numeric; without a return type,
    # udf() defaults to StringType and the column holds strings.
    return udf(lambda value: 1 if value in suffixes else 0, LongType())


# Last word of the block value. Null input and values that do not end in a
# word character now give null instead of raising TypeError / IndexError
# inside the Spark job.
extract_blok = udf(_last_word, StringType())

# One 0/1 flag per family of suffix spellings.
isStreet = _suffix_flag_udf('ST', 'St', 'st')
isAV = _suffix_flag_udf('Ave', 'AV', 'AVE')
isBLVD = _suffix_flag_udf('BLVD')
isRD = _suffix_flag_udf('RD')
isPL = _suffix_flag_udf('PL', 'pl')
isBROADWAY = _suffix_flag_udf('BROADWAY', 'Broadway')
isPKWY = _suffix_flag_udf('PKWY', 'Pkwy')

# The lambda looks duration_day_func up at call time, so a redefinition in the
# notebook is picked up without rebuilding this UDF.
duration_day_udf = udf(lambda x: duration_day_func(x), FloatType())

In [None]:
# Add "block_extract": the extract_blok UDF applied to the "block" column.
df_crime_socio = df_crime_socio.withColumn(
    "block_extract",
    extract_blok(df_crime_socio["block"]),
)

In [None]:
# One flag column per suffix family, each computed from "block_extract".
df_crime_socio = (
    df_crime_socio
    .withColumn("isStreet", isStreet(df_crime_socio.block_extract))
    .withColumn("isAV", isAV(df_crime_socio.block_extract))
    .withColumn("isBLVD", isBLVD(df_crime_socio.block_extract))
    .withColumn("isRD", isRD(df_crime_socio.block_extract))
    .withColumn("isPL", isPL(df_crime_socio.block_extract))
    .withColumn("isBROADWAY", isBROADWAY(df_crime_socio.block_extract))
    .withColumn("isPKWY", isPKWY(df_crime_socio.block_extract))
)

In [None]:
# Show 20 randomly sampled rows out of the first 500. No seed is set, so the
# rows shown differ between runs.
df_crime_socio.limit(500).toPandas().sample(20)

In [None]:
# Add "duree_day": duration_day_udf evaluated on the "date" column.
df_crime_socio = df_crime_socio.withColumn(
    'duree_day',
    duration_day_udf(func.col('date')),
)

In [None]:
# Column names currently present on df_crime_socio.
df_crime_socio.columns

In [None]:
# Print the schema of df_crime_socio to check the type of each column.
df_crime_socio.printSchema()

In [None]:
# Calendar parts extracted from the "date" column. "day" and "dayofmonth" are
# both produced by func.dayofmonth, so they carry the same value.
df_crime_socio = (
    df_crime_socio
    .withColumn("month", func.month("date"))
    .withColumn("year", func.year("date"))
    .withColumn("day", func.dayofmonth("date"))
    .withColumn("hour", func.hour("date"))
    .withColumn("minute", func.minute("date"))
    .withColumn("dayofmonth", func.dayofmonth("date"))
    .withColumn("dayofyear", func.dayofyear("date"))
    .withColumn("dayofweek", func.dayofweek("date"))
)

In [None]:
# Show 10 randomly sampled rows out of the first 1000. No seed is set, so the
# rows shown differ between runs.
df_crime_socio.limit(1000).toPandas().sample(10)

In [None]:
# Row count before joining df_temp; the baseline for the counts that follow.
df_crime_socio.count()

In [None]:
# Left join: every row of df_crime_socio is kept, and the df_temp columns are
# attached where year / month / day / hour all match.
df_total = df_crime_socio.join(
    df_temp,
    on=['year', 'month', 'day', 'hour'],
    how="left",
)

In [None]:
# Row count after the left join with df_temp. A count above the pre-join
# figure means some keys matched several df_temp rows.
df_total.count()

In [None]:
# Drop rows that are exact duplicates across all columns
# (no column subset is given).
df_total = df_total.dropDuplicates()

In [None]:
# Row count after removing duplicate rows.
df_total.count()

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

In [None]:
# NOTE(review): none of these names ('job', 'marital', ...) is created by the
# cells above — confirm they exist in df_total before using them.
categoricalColumns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
# Starts empty; presumably filled with pipeline stages by later cells — TODO confirm.
stages = []