In [1]:
import os
import findspark

In [2]:
# Must run before the pyspark imports in the next cell: findspark makes pyspark importable.
findspark.init()

In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, explode, regexp_replace, split, col, size, array_contains, isnan, when, count, array, reverse, udf, unix_timestamp, from_unixtime, date_format, format_number, length, first
from pyspark.sql.types import ArrayType, StringType, DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy.stats import kurtosis, skew
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.svm import SVC
import math
import folium
import pickle
# from geopy.geocoders import Nominatim
from itertools import chain

In [4]:
# Build (or reuse) a single-node SparkSession for this notebook.
spark_url = 'local'
spark = (
    SparkSession.builder
    .master(spark_url)
    .appName('Project')
    # NOTE(review): presumably required by the timestamp patterns parsed later on - confirm.
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)
spark.conf.set("spark.sql.csv.parser.multiLine", "true")

In [5]:
# Local copy of the dataset; on Colab use
# '/content/drive/MyDrive/traffy/bangkok_traffy.csv' instead.
path = 'bangkok_traffy.csv'
df = (
    spark.read
    .option("multiLine", "true")  # let a single record span several physical lines
    .csv(path, header=True, inferSchema=True)
)
df.show(5)

+-----------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+--------------------+--------------------+--------------+----+------------+--------------------+
|  ticket_id|               type|        organization|             comment|               photo|         photo_after|            coords|             address|subdistrict|district|            province|           timestamp|         state|star|count_reopen|       last_activity|
+-----------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+--------------------+--------------------+--------------+----+------------+--------------------+
|2021-9LHDM6|                 {}|                null|            ไม่มีภาพ|https://storage.g...|                null|100.48661,13.79386|1867 จรัญสนิทวงศ์...|    บางพลัด| บางพล

In [6]:
# For every column, count the cells that are NaN or null.
null_exprs = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in df.columns
]
null_counts = df.select(null_exprs)
null_counts.show()

+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+------+------------+-------------+
|ticket_id|type|organization|comment|photo|photo_after|coords|address|subdistrict|district|province|timestamp|state|  star|count_reopen|last_activity|
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+------+------------+-------------+
|     2413|1550|        2640|   3911| 2089|      85624|  2019|   4433|       2089|    2092|    2393|     2233| 2026|164087|      120042|         2558|
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+------+------------+-------------+



In [7]:
# Filter the DataFrame to include only rows with null values in the "column_name" column
def check_first_null(filtered_df):
    """Return the first row of ``filtered_df``, or a message when it has no rows.

    Args:
        filtered_df: a DataFrame already filtered down to the rows of interest
            (here: rows whose chosen column is null).

    Returns:
        The first row when at least one row exists; otherwise the string
        ``"No null values in <filtered_df>."``, where ``<filtered_df>`` is the
        DataFrame's own string representation.
    """
    # head() fetches at most one row and yields None for an empty DataFrame,
    # so emptiness is detected without a separate full count() pass.
    first_row = filtered_df.head()
    if first_row is None:
        return f"No null values in {filtered_df}."
    return first_row
    
# Rows where each column of interest is null, one DataFrame per column.
ticket_id_null_df = df.filter(df["ticket_id"].isNull())
coords_null_df = df.filter(df["coords"].isNull())
address_null_df = df.filter(df["address"].isNull())

# Show the first offending row of each, separated by a blank line.
for null_rows in (ticket_id_null_df, coords_null_df, address_null_df):
    print(check_first_null(null_rows), "\n")

Row(ticket_id=None, type='{ถนน}', organization='สำนักงาน ป.ป.ท.,เขตจอมทอง,ฝ่ายเทศกิจ เขตจอมทอง,ผอ.เขตจอมทอง (นายณัฐพงษ์),กลุ่มกรุงธนเหนือ (รองปลัดฯ เฉลิมพล)', comment=None, photo='https://storage.googleapis.com/traffy_public_bucket/TeamChadChart/corruption_photo2.png', photo_after='https://storage.googleapis.com/traffy_public_bucket/attachment/2022-06/e9596093de70ae8abacd6574f26a2d0f4466fe9f.jpg', coords='100.45568,13.69103', address=None, subdistrict='บางขุนเทียน', district='จอมทอง', province='กรุงเทพมหานคร', timestamp='2022-06-09 23:34:34.98044+00', state='เสร็จสิ้น', star='5', count_reopen=None, last_activity='2022-06-10 11:02:34.607728+00') 

Row(ticket_id='2022-7DABXT', type='{สะพาน}', organization=None, comment='"เคยดีใจมีสายสีน้ำเงินสถานี""แยกไฟฉาย""', photo=None, photo_after=None, coords=None, address=None, subdistrict=None, district=None, province=None, timestamp=None, state=None, star=None, count_reopen=None, last_activity=None) 

Row(ticket_id='2022-7DABXT', type='{สะพาน}', 

The cell above shows the first row with a null value in each selected column (ticket_id, coords, address), which hints at how these null values are related.
1. ticket_id can be null even when state='เสร็จสิ้น', which means we can drop this column without losing anything significant.
2. address has about twice as many null values as coords (4433 vs. 2019). In the first row with a null coords, both are null, which suggests that a missing coords implies a missing address, but not vice versa. We'll check this in the next step.

In [8]:
sub = ['coords', 'address']
# how='all': a row is removed only when coords AND address are both missing.
df = df.dropna(how='all', subset=sub)

# Re-count what is still missing in the two columns.
remaining_null_exprs = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in sub
]
sub_null_counts = df.select(remaining_null_exprs)
sub_null_counts.show()

+------+-------+
|coords|address|
+------+-------+
|     1|   2415|
+------+-------+



As we can see, almost all of the null coords have disappeared. So we don't need to map any address to coords, and we can simply drop the remaining rows that still have a null in either column.

In [9]:
# Remove every row that is still missing coords or address (either one is enough).
df = df.dropna(how='any', subset=['coords', 'address'])

From this step on, we'll use df_use as the data for ML.

In [10]:
# Keep only the tickets whose state is 'เสร็จสิ้น'; state is filtered, not encoded.
df_use = df.filter(col('state') == 'เสร็จสิ้น')

# type is a brace-wrapped, comma-separated string (e.g. '{ถนน}'): strip the braces and split into a list.
df_use = df_use.withColumn("type", split(regexp_replace("type", "[{}]", ""), ","))
df_use = df_use.dropna(how='all', subset=['type'])

# coords is stored as 'longitude,latitude'; split it and reverse to get [latitude, longitude].
# reverse(split(...)) already yields a flat array of strings, so no wrapping array() or
# Python UDF round trip is needed.
df_use = df_use.withColumn('coords', reverse(split(col('coords'), ',')))

# Re-parse both time columns at minute precision (seconds come out as :00).
df_use = df_use.withColumn('timestamp', from_unixtime(unix_timestamp(col('timestamp'), 'yyyy-MM-dd HH:mm')))
df_use = df_use.withColumn('last_activity', from_unixtime(unix_timestamp(col('last_activity'), 'yyyy-MM-dd HH:mm')))

# Days between creation and last activity (86400 seconds per day) ...
df_use = df_use.withColumn('time_diff', (unix_timestamp(col('last_activity')) - unix_timestamp(col('timestamp'))) / 86400)
# ... rounded to 0 decimals. format_number returns a string, so time_diff is a string column from here on.
df_use = df_use.withColumn('time_diff', format_number(col('time_diff'), 0))
df_use.show(5)

+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+-------------+-------------------+---------+----+------------+-------------------+---------+
|  ticket_id|                type|        organization|             comment|               photo|         photo_after|              coords|             address|subdistrict|district|     province|          timestamp|    state|star|count_reopen|      last_activity|time_diff|
+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+-------------+-------------------+---------+----+------------+-------------------+---------+
|2021-FYJTFP|         [ความสะอาด]|          เขตบางซื่อ|             ขยะเยอะ|https://storage.g...|                null|[13.81865, 100.53...|12/14 ถนน กรุงเทพ...|       null|    nu

In [11]:
# Distribution of the star rating, including the null bucket.
df_use.groupBy('star').count().show()

+----+-----+
|star|count|
+----+-----+
|   3| 9949|
|null|97721|
|   5|45519|
|   1|13017|
|   4|20144|
|   2| 4456|
+----+-----+



In [12]:
def check_first_null(filtered_df):
    """Return the first row of ``filtered_df``, or a message when it has no rows.

    NOTE: this repeats the definition from the earlier null-inspection cell;
    it is kept so this cell can still be run on its own.

    Args:
        filtered_df: a DataFrame already filtered down to the rows of interest
            (here: rows whose chosen column is null).

    Returns:
        The first row when at least one row exists; otherwise the string
        ``"No null values in <filtered_df>."``, where ``<filtered_df>`` is the
        DataFrame's own string representation.
    """
    # head() fetches at most one row and yields None for an empty DataFrame,
    # so emptiness is detected without a separate full count() pass.
    first_row = filtered_df.head()
    if first_row is None:
        return f"No null values in {filtered_df}."
    return first_row
    
# First ticket that has no star rating.
star_null_df = df_use.filter(df_use["star"].isNull())
print(check_first_null(star_null_df), "\n")

Row(ticket_id='2021-FYJTFP', type=['ความสะอาด'], organization='เขตบางซื่อ', comment='ขยะเยอะ', photo='https://storage.googleapis.com/traffy_public_bucket/attachment/2021-09/3063e748259afbb7171467e19b92e9cc1f1a5826.jpg', photo_after=None, coords=['13.81865', '100.53084'], address='12/14 ถนน กรุงเทพ- นนทบุรี แขวง บางซื่อ เขตบางซื่อ กรุงเทพมหานคร 10800 ประเทศไทย', subdistrict=None, district=None, province='กรุงเทพมหานคร', timestamp='2021-09-03 12:51:00', state='เสร็จสิ้น', star=None, count_reopen=None, last_activity='2022-06-04 15:34:00', time_diff='274') 



In [13]:
# UDF that returns the first entry of a list column. It is an alternative to
# explode() (keep only the first type per ticket) and is not applied below.
first_element = udf(lambda values: values[0], StringType())

# One output row per entry of the type list; empty type strings are discarded.
df_exploded = (
    df_use
    .withColumn('type', explode(col('type')))
    .filter(col('type') != '')
)

df_exploded.show()

+-----------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+-------------+-------------------+---------+----+------------+-------------------+---------+
|  ticket_id|       type|        organization|             comment|               photo|         photo_after|              coords|             address|subdistrict|district|     province|          timestamp|    state|star|count_reopen|      last_activity|time_diff|
+-----------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+-------------+-------------------+---------+----+------------+-------------------+---------+
|2021-FYJTFP|  ความสะอาด|          เขตบางซื่อ|             ขยะเยอะ|https://storage.g...|                null|[13.81865, 100.53...|12/14 ถนน กรุงเทพ...|       null|    null|กรุงเทพมหานคร|2021-09-03 12:51:00

In [14]:
# Split the [latitude, longitude] pair into two double columns and drop the pair itself.
coords_col = col("coords")
df_t = (
    df_exploded
    .withColumn("latitude", coords_col.getItem(0).cast('double'))
    .withColumn("longitude", coords_col.getItem(1).cast('double'))
    .drop("coords")
)

In [15]:
# Check that the cast to double left no null latitude/longitude behind.
null_latitude = df_t.filter(df_t["latitude"].isNull())
null_longitude = df_t.filter(df_t["longitude"].isNull())
for null_rows in (null_latitude, null_longitude):
    print(check_first_null(null_rows), "\n")

No null values in DataFrame[ticket_id: string, type: string, organization: string, comment: string, photo: string, photo_after: string, address: string, subdistrict: string, district: string, province: string, timestamp: string, state: string, star: string, count_reopen: string, last_activity: string, time_diff: string, latitude: double, longitude: double]. 

No null values in DataFrame[ticket_id: string, type: string, organization: string, comment: string, photo: string, photo_after: string, address: string, subdistrict: string, district: string, province: string, timestamp: string, state: string, star: string, count_reopen: string, last_activity: string, time_diff: string, latitude: double, longitude: double]. 



In [16]:
# Columns that must all be present for a row to be usable for modelling.
drop_list = ['type', 'latitude', 'longitude', 'subdistrict', 'district', 'timestamp', 'last_activity', 'organization', 'star', 'time_diff']
# A row missing any one of them is removed.
df_t = df_t.dropna(how='any', subset=drop_list)

## Classification

In [17]:
def train_random_forest(df, label_col, numeric_col, nominal_col):
    """Grid-search a random forest classifier on one-hot encoded features.

    Args:
        df: pandas DataFrame holding the label column and every feature column.
        label_col: name of the column to predict.
        numeric_col: names of the numeric feature columns. Kept for interface
            compatibility; it is not used for selection, because every column
            of ``df`` that is neither the label nor listed in ``nominal_col``
            is passed to the model unchanged.
        nominal_col: names of the categorical columns to one-hot encode. The
            first level of each is dropped (``drop_first=True``).

    Returns:
        Tuple ``(CV_rfc, model, y_test, y_pred)``: the fitted GridSearchCV,
        its best estimator, the held-out labels, and the predictions for them.

    Side effects:
        Prints the confusion matrix, the classification report and the
        accuracy on the held-out split.
    """
    # Work on the frame that was passed in (not a notebook-level global) so
    # the function gives the same result regardless of kernel state.
    dummy_df = pd.get_dummies(df[nominal_col], drop_first=True)
    features = pd.concat([df.drop(columns=nominal_col), dummy_df], axis=1)

    y = features.pop(label_col)
    X = features
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.3, random_state=123
    )

    rfc = RandomForestClassifier(random_state=123)
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 3, 6],
        'min_samples_leaf': [2, 5, 10],
        'n_estimators': [100, 200],
        # single value: kept in the grid so it shows up in best_params_
        'random_state': [123]
    }

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    CV_rfc.fit(X_train, y_train)
    model = CV_rfc.best_estimator_
    y_pred = model.predict(X_test)

    # Use the classes the model was trained on instead of a hard-coded 1..5.
    print(confusion_matrix(y_test, y_pred, labels=model.classes_))
    print(classification_report(y_test, y_pred, digits=4, zero_division=0))

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    return CV_rfc, model, y_test, y_pred

In [19]:
numeric_col = []
nominal_col = ['type', 'star', 'time_diff', 'subdistrict', 'district', 'organization']
label_col = 'star'

# Pull only the modelling columns out of Spark and into pandas.
df_select = df_t.select(*(numeric_col + nominal_col)).toPandas()

# star is read as a string; map the five ratings to integers.
star_to_int = {str(rating): rating for rating in range(1, 6)}
df_select = df_select.replace({"star": star_to_int})

# The categorical features to encode: nominal_col without the label.
nominal_use_col = [name for name in nominal_col if name != label_col]

CV_rfc, model, y_test, y_pred = train_random_forest(df_select, label_col, numeric_col, nominal_use_col)

: 

: 

In [None]:
# Log the best hyper-parameters, the test accuracy and the fitted model to MLflow.
mlflow.set_tracking_uri("http://localhost:5000")
experiment_name = "traffy_predict_star"
mlflow.set_experiment(experiment_name)
best_params = CV_rfc.best_params_

# train_random_forest only prints the accuracy, so recompute it here from the
# values it returned instead of relying on a name that is local to that function.
accuracy = accuracy_score(y_test, y_pred)

# The context manager ends the run on exit; no explicit end_run() is needed.
with mlflow.start_run():
    # one call logs every tuned parameter exactly once
    mlflow.log_params(best_params)
    mlflow.log_metric('accuracy', accuracy)
    # NOTE(review): save_model presumably fails if this directory already exists - confirm before re-running.
    mlflow.sklearn.save_model(model, 'random_forest_model_1')



In [None]:
from flask import Flask, request, jsonify

app = Flask(__name__)

# One-hot feature columns the fitted model expects, in training order.
# Taken from the model itself because the train/test split lives only inside
# train_random_forest and is not available at notebook level.
FEATURE_COLUMNS = list(model.feature_names_in_)


@app.route('/', methods=['GET'])
def hello():
    """Simple liveness endpoint."""
    return "Hello Guys"


@app.route('/predict', methods=['POST'])
def predict_star():
    """Predict the star rating for one ticket.

    Expects a JSON object mapping a categorical feature name to its value,
    e.g. ``{"district": "...", "type": "..."}``. Each pair is turned into the
    one-hot column ``"<name>_<value>"``.

    Returns:
        JSON ``{"predicted_star": <int>}``, or a 400 response with an
        ``error`` message when the body is not a JSON object.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'request body must be a JSON object'}), 400

    # Start from an all-zero row with exactly the columns seen during training.
    test_dummy_df = pd.DataFrame(0, index=[0], columns=FEATURE_COLUMNS)

    for field, value in data.items():
        dummy_name = f"{field}_{value}"
        # A name without a column is either the dropped first level of a
        # feature or a value never seen in training. Leaving the row at zero
        # keeps the column set identical to what the model was fitted on.
        if dummy_name in test_dummy_df.columns:
            test_dummy_df.loc[0, dummy_name] = 1

    predicted_star = model.predict(test_dummy_df)

    response = {'predicted_star': int(predicted_star[0])}
    return jsonify(response)


if __name__ == '__main__':
    app.run(host="localhost", port=8000)

[33m * Tip: There are .env or .flaskenv files present. Do "pip install python-dotenv" to use them.[0m


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://localhost:8000
Press CTRL+C to quit
127.0.0.1 - - [17/May/2023 21:26:11] "POST /predict HTTP/1.1" 200 -


: 

: 

In [None]:
# spark.stop()