In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from time import time
import matplotlib.pyplot as plt

# Helper functions for project1

## 1) Functions introduced with white markdown are sub-functions (building blocks)
## 2) Functions introduced with blue markdown are aggregate functions built from the sub-functions (you only need to call the functions with blue markdown)

### show_head_text reads a txt file and prints its first n lines (like `head -n`)

In [None]:
def show_head_text(fileName, n_line):
    """Print the first ``n_line`` lines of a text file.

    Each line is passed to ``print`` exactly as read, so the line's own
    trailing newline is followed by the one ``print`` adds. If the file has
    fewer than ``n_line`` lines, ``readline`` returns an empty string for the
    missing ones and an empty line is printed for each.

    Parameters
    ----------
    fileName : str
        Path of the text file to read.
    n_line : int
        Number of ``readline`` calls to make.
    """
    # The context manager closes the file even if reading or printing raises.
    with open(fileName, "r") as f:
        for _ in range(n_line):
            print(f.readline())

### Convert a text file to a DataFrame without reformatting any values

In [None]:
def txt_to_df(fileName):
    """Parse a text file with a 9-line metadata preamble into two DataFrames.

    Layout expected by this parser:

    * lines 1-9: metadata of the form ``"% name: value"``; the first two
      characters are dropped and the line is split at the first ``":"``
    * line 10: the column header, ``"% col1, col2, ..."`` (split on ``", "``)
    * lines 11+: whitespace-separated data rows

    All values are kept as strings; no type conversion is done here.

    Parameters
    ----------
    fileName : str
        Path of the text file to parse.

    Returns
    -------
    (info_df, df) : tuple of pandas.DataFrame
        ``info_df`` has one row and one column per metadata name.
        ``df`` has one row per data line, with a 0..n-1 index.

    Raises
    ------
    ValueError
        If the file ends before the header line (line 10), or if a data row
        does not have as many fields as the header.
    """
    info_data = dict()
    header = None
    rows = []

    # The context manager closes the file even when parsing fails part-way.
    with open(fileName, "r") as f:
        for count, line in enumerate(f, start=1):

            if count < 10:
                ### Metadata line: drop "% " then split name / value at the first ":"
                ### (partition keeps values that themselves contain ":")
                head, _, data = line[2:].partition(":")
                ### Strip the newline and the single space that follows the ":"
                info_data[head] = [data.rstrip("\n")[1:]]

            ### This line we determine that is header
            elif count == 10:
                ### Remove "% " at the start and the trailing newline, then split names
                header = line[2:].rstrip("\n").split(", ")

            else:
                fields = line.split()
                ### Skip blank lines (e.g. a trailing empty line at end of file)
                if fields:
                    rows.append(fields)

    if header is None:
        raise ValueError(
            "{!r} has no header line: expected 9 metadata lines followed by a header".format(fileName)
        )

    # Build the frame once from all rows; appending row by row is quadratic
    # and DataFrame.append no longer exists in pandas >= 2.0.
    df = pd.DataFrame(rows, columns=header)
    info_df = pd.DataFrame.from_dict(info_data)
    return info_df, df

### Zero-pad any time unit of length one, e.g. 7 => 07

In [None]:
def make_len2(time):
    """Return ``time`` prefixed with "0" when it is exactly one character long.

    Values of any other length (including the empty string) are returned
    unchanged.
    """
    return "0" + time if len(time) == 1 else time

### Convert all specified columns to length 2 by using make_len2

In [None]:
def all_make_len2(df, headers):
    """Apply ``make_len2`` to every value of each column named in ``headers``.

    The columns are overwritten on ``df`` itself, and the same ``df`` object
    is returned.
    """
    for column in headers:
        padded = df[column].apply(make_len2)
        df[column] = padded
    return df

### Create a datetime column from the specified columns (e.g. year, month, day, UTC hour)

In [None]:
def create_datetime(df, headers, string_format):
    """Concatenate the ``headers`` columns and parse the result as a datetime.

    Two columns are written onto ``df``:

    * ``"datestr"``: the values of the ``headers`` columns joined in the
      given order, with no separator
    * ``"datetime"``: ``"datestr"`` parsed by ``pd.to_datetime`` using
      ``string_format``

    The same ``df`` object is returned.
    """
    # Start from an empty string so each column can be appended in turn.
    df["datestr"] = ""
    for column in headers:
        df["datestr"] = df["datestr"] + df[column]

    parsed = pd.to_datetime(df["datestr"], format=string_format)
    df["datetime"] = parsed
    return df

### Shift datetime to specific utc

In [None]:
def shiftTime(hour, df=None):
    """Shift the ``"datetime"`` and ``"UTC Hour"`` columns by ``hour`` hours.

    Parameters
    ----------
    hour : int
        Number of hours to add (may be negative).
    df : pandas.DataFrame, optional
        Frame to modify in place. When omitted, the notebook-level variable
        named ``df`` is used, which is what this function always did before
        the parameter existed. Passing the frame explicitly is preferred
        because it does not depend on hidden notebook state.

    Returns
    -------
    pandas.Series
        The shifted ``"datetime"`` column.

    Raises
    ------
    NameError
        If ``df`` is omitted and no global ``df`` exists.
    """
    if df is None:
        try:
            df = globals()["df"]
        except KeyError:
            raise NameError("name 'df' is not defined") from None

    df["datetime"] = df["datetime"] + timedelta(hours=hour)
    # Wrap around midnight so the hour stays within 0-23 (e.g. 20 + 7 -> 3).
    df["UTC Hour"] = (df["UTC Hour"] + hour) % 24
    return df["datetime"]

### Extract string datetime for pm2.5

In [None]:
def extract_pm_datetime(df):
    """Split the ``"datetime_str"`` column into string date-part columns.

    The text is cut at fixed character positions: characters 0-3 go to
    ``"Year"``, 5-6 to ``"Month"``, 8-9 to ``"Day"`` and 11-12 to
    ``"UTC Hour"``. The pieces stay strings. ``"datetime_str"`` is then
    removed from ``df`` in place and the same ``df`` object is returned.
    """
    positions = {
        "Year": slice(None, 4),
        "Month": slice(5, 7),
        "Day": slice(8, 10),
        "UTC Hour": slice(11, 13),
    }
    for column, span in positions.items():
        # Bind ``span`` as a default so each lambda keeps its own slice.
        df[column] = df["datetime_str"].apply(lambda text, span=span: text[span])

    df.drop(columns=["datetime_str"], inplace=True)
    return df

### Merge all the functions above to convert a txt file to a DataFrame with the datetime shifted from UTC

In [None]:
def txt_to_df_with_shift_datetime(fileName, utc_offset_hours=7):
    """Load a PM2.5 text file and shift its timestamps by a fixed offset.

    Steps:

    1. parse the file with ``txt_to_df``
    2. convert ``"PM2.5"``, ``"PM10_mask"`` and ``"Retrospective"`` to numbers
    3. zero-pad ``"Month"``, ``"Day"`` and ``"UTC Hour"`` and build a
       ``"datetime"`` column from year/month/day/hour
    4. add ``utc_offset_hours`` to ``"datetime"``
    5. rewrite ``"Year"``, ``"Month"``, ``"Day"`` and ``"UTC Hour"`` from the
       shifted timestamp (as strings), so after this call ``"UTC Hour"``
       holds the shifted hour despite its name

    Parameters
    ----------
    fileName : str
        Path of the text file to load.
    utc_offset_hours : int, optional
        Hours to add to the parsed timestamps. Defaults to 7, the value that
        used to be hard-coded here.

    Returns
    -------
    (info_df, df) : tuple of pandas.DataFrame
        The metadata frame from ``txt_to_df`` and the converted data frame.
    """
    info_df, df = txt_to_df(fileName)

    for header in ("PM2.5", "PM10_mask", "Retrospective"):
        df[header] = pd.to_numeric(df[header])

    df = all_make_len2(df, ["Month", "Day", "UTC Hour"])
    df = create_datetime(df, ["Year", "Month", "Day", "UTC Hour"], "%Y%m%d%H")
    df = df.drop(columns=["datestr"])

    df["datetime"] = df["datetime"] + timedelta(hours=utc_offset_hours)
    # Format explicitly: astype(str) may omit the time-of-day when every
    # timestamp falls on midnight, which would break the fixed-position
    # slicing done by extract_pm_datetime.
    df["datetime_str"] = df["datetime"].dt.strftime("%Y-%m-%d %H:%M:%S")
    df = extract_pm_datetime(df)

    return info_df, df

### Print the unique values of the given columns to check whether any datetime value is out of range

In [None]:
def check_city_feature_unique(df, headers, city):
    """Print the unique values of each column in ``headers``.

    Every column is introduced by a ``"<city> <column>"`` title line, and the
    sections are separated by a line of 30 dashes.
    """
    separator = "-" * 30
    print(separator)
    for column in headers:
        title = city + " " + column
        print(title)
        print(df[column].unique())
        print(separator)

### Check null value all columns in dataframe

In [None]:
def check_null_all(df):
    """Print how many null values each column of ``df`` contains."""
    template = "Number of null values in each column:\n{}"
    per_column = df.isnull().sum()
    print(template.format(per_column))

### See trend in pm2.5 each year

In [None]:
def pm_trend_each_year(df):
    """Plot the monthly mean of ``"PM2.5"`` as one line per year, 2016-2019.

    ``"Year"`` and ``"Month"`` may hold integers or numeric strings; both are
    converted to numbers before filtering and grouping. Each line is drawn
    against the months that actually occur in that year, and a year with no
    rows is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``"Year"``, ``"Month"`` and ``"PM2.5"``.
    """
    years = pd.to_numeric(df["Year"], errors="coerce")
    months = pd.to_numeric(df["Month"], errors="coerce")

    for year in (2016, 2017, 2018, 2019):
        in_year = years == year
        # Average only the plotted column: a mean over every column fails on
        # non-numeric columns in recent pandas versions.
        monthly = df.loc[in_year, "PM2.5"].groupby(months[in_year].to_numpy()).mean()
        if monthly.empty:
            continue
        # Use the months present in the data rather than assuming a fixed
        # range, so x and y always have the same length.
        plt.plot(monthly.index, monthly.values, label="year {}".format(year))

    plt.legend()

    plt.show()

### Extract date string for wind and temp

In [None]:
def extract_date_string(df):
    """Add integer date-part columns cut from the text in ``"datetime"``.

    ``"datetime"`` is read as text and cut at fixed positions: characters
    0-3 become ``"Year"``, 5-6 ``"Month"``, 8-9 ``"Day"`` and 11-12
    ``"UTC Hour"``, each converted with ``int``. The columns are written onto
    ``df`` and the same ``df`` object is returned.
    """
    positions = {
        "Year": slice(None, 4),
        "Month": slice(5, 7),
        "Day": slice(8, 10),
        "UTC Hour": slice(11, 13),
    }
    for column, span in positions.items():
        # Bind ``span`` as a default so each lambda keeps its own slice.
        df[column] = df["datetime"].apply(lambda text, span=span: int(text[span]))
    return df

### See the wind direction trend for each year

In [None]:
def wind_direct_trend_each_year(df):
    """Plot the monthly mean of ``"WindDir"`` as one line per year, 2016-2019.

    Rows are assigned to a year by their ``"datetime"`` value; the 2016 line
    also includes any rows dated before 2016. Each line is drawn against the
    months that actually occur in that year, and a year with no rows is
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime-typed ``"datetime"`` column plus ``"Month"`` and
        ``"WindDir"``.
    """
    # NOTE(review): this is a plain arithmetic mean; if WindDir is an angle
    # in degrees, averaging across the 0/360 wrap can mislead -- confirm.
    first_year = 2016
    stamps = df["datetime"]

    for year in range(first_year, 2020):
        in_year = stamps < datetime(year + 1, 1, 1, 0, 0, 0)
        if year > first_year:
            in_year &= stamps >= datetime(year, 1, 1, 0, 0, 0)

        # Average only the plotted column: a mean over every column fails on
        # non-numeric columns in recent pandas versions.
        monthly = df.loc[in_year].groupby("Month")["WindDir"].mean()
        if monthly.empty:
            continue
        # Use the months present in the data rather than assuming a fixed
        # range, so x and y always have the same length.
        plt.plot(monthly.index, monthly.values, label="year {}".format(year))

    plt.legend()
    plt.show()

### See wind speed trend each year

In [None]:
def wind_speed_trend_each_year(df):
    """Plot the monthly mean of ``"Wind Speed(km/h)"``, one line per year, 2016-2019.

    Rows are assigned to a year by their ``"datetime"`` value; the 2016 line
    also includes any rows dated before 2016. Each line is drawn against the
    months that actually occur in that year, and a year with no rows is
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime-typed ``"datetime"`` column plus ``"Month"`` and
        ``"Wind Speed(km/h)"``.
    """
    first_year = 2016
    stamps = df["datetime"]

    for year in range(first_year, 2020):
        in_year = stamps < datetime(year + 1, 1, 1, 0, 0, 0)
        if year > first_year:
            in_year &= stamps >= datetime(year, 1, 1, 0, 0, 0)

        # Average only the plotted column: a mean over every column fails on
        # non-numeric columns in recent pandas versions.
        monthly = df.loc[in_year].groupby("Month")["Wind Speed(km/h)"].mean()
        if monthly.empty:
            continue
        # Use the months present in the data rather than assuming a fixed
        # range, so x and y always have the same length.
        plt.plot(monthly.index, monthly.values, label="year {}".format(year))

    plt.legend()
    plt.show()

### See temp trend each year

In [None]:
def temp_trend_each_year(df):
    """Plot the monthly mean of ``"Temp(C)"`` as one line per year, 2016-2019.

    Rows are assigned to a year by their ``"datetime"`` value; the 2016 line
    also includes any rows dated before 2016. Each line is drawn against the
    months that actually occur in that year, and a year with no rows is
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime-typed ``"datetime"`` column plus ``"Month"`` and
        ``"Temp(C)"``.
    """
    first_year = 2016
    stamps = df["datetime"]

    for year in range(first_year, 2020):
        in_year = stamps < datetime(year + 1, 1, 1, 0, 0, 0)
        if year > first_year:
            in_year &= stamps >= datetime(year, 1, 1, 0, 0, 0)

        # Average only the plotted column: a mean over every column fails on
        # non-numeric columns in recent pandas versions.
        monthly = df.loc[in_year].groupby("Month")["Temp(C)"].mean()
        if monthly.empty:
            continue
        # Use the months present in the data rather than assuming a fixed
        # range, so x and y always have the same length.
        plt.plot(monthly.index, monthly.values, label="year {}".format(year))

    plt.legend()
    plt.show()

### convert hotspot acq_time and acq_date to datetime and extract to each field

In [None]:
def add_zero_acq_time(date):
    """Return ``date`` as text, left-padded with zeros to at least 4 characters.

    The result is meant to be read as ``HHMM``: 5 -> "0005", 45 -> "0045",
    130 -> "0130", 1230 -> "1230". Values that are already four or more
    characters long are returned as their plain string form.

    One-character values are padded too, so that the hour part of the
    result is never empty.
    """
    return str(date).zfill(4)

def hotspot_convert_datetime(df, utc_offset_hours=7):
    """Build a shifted ``"datetime"`` column from ``"acq_date"`` and ``"acq_time"``.

    ``"acq_date"`` is read as text with the year in characters 0-3, the
    month in 5-6 and the day in 8-9. ``"acq_time"`` is read as ``HHMM``
    after zero-padding to four characters, so 5 means 00:05. The combined
    timestamp is shifted by ``utc_offset_hours`` and the date parts are taken
    from the shifted value.

    Columns written onto ``df`` (which is also returned):

    * ``"acq_time_str"``: zero-padded ``"acq_time"`` text
    * ``"Year"``, ``"Month"``, ``"Day"``, ``"UTC Hour"``, ``"UTC Min"``:
      integer parts of the shifted timestamp, so ``"UTC Hour"`` holds the
      shifted hour despite its name
    * ``"datetime"``: the shifted timestamp

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``"acq_date"`` and ``"acq_time"``.
    utc_offset_hours : int, optional
        Hours to add to the acquisition time. Defaults to 7, the value that
        used to be hard-coded here.
    """
    date_text = df["acq_date"].astype(str)
    # Pad to four characters so even one-digit times such as 5 (00:05) have
    # a non-empty hour part.
    time_text = df["acq_time"].astype(str).str.zfill(4)

    # pd.to_datetime assembles timestamps from year/month/day/hour/minute columns.
    parts = pd.DataFrame({
        "year": date_text.str[:4].astype(int),
        "month": date_text.str[5:7].astype(int),
        "day": date_text.str[8:10].astype(int),
        "hour": time_text.str[:-2].astype(int),
        "minute": time_text.str[-2:].astype(int),
    })
    shifted = pd.to_datetime(parts) + timedelta(hours=utc_offset_hours)

    df["acq_time_str"] = time_text
    df["Year"] = shifted.dt.year.astype("int64")
    df["Month"] = shifted.dt.month.astype("int64")
    df["Day"] = shifted.dt.day.astype("int64")
    df["UTC Hour"] = shifted.dt.hour.astype("int64")
    df["UTC Min"] = shifted.dt.minute.astype("int64")
    df["datetime"] = shifted
    return df


In [None]:
def hotspot_trend_each_year(df, feature):
    """Plot the monthly mean of column ``feature`` as one line per year, 2016-2019.

    Rows are assigned to a year by their ``"datetime"`` value; the 2016 line
    also includes any rows dated before 2016. Each line is drawn against the
    months that actually occur in that year, and a year with no rows is
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime-typed ``"datetime"`` column, a ``"Month"`` column
        and the column named by ``feature``.
    feature : str
        Name of the column to average and plot.
    """
    first_year = 2016
    stamps = df["datetime"]

    for year in range(first_year, 2020):
        in_year = stamps < datetime(year + 1, 1, 1, 0, 0, 0)
        if year > first_year:
            in_year &= stamps >= datetime(year, 1, 1, 0, 0, 0)

        # Index by the ``feature`` argument, not the literal name "feature".
        # Averaging only that column also avoids failing on non-numeric
        # columns in recent pandas versions.
        monthly = df.loc[in_year].groupby("Month")[feature].mean()
        if monthly.empty:
            continue
        # Use the months present in the data rather than assuming a fixed
        # range, so x and y always have the same length.
        plt.plot(monthly.index, monthly.values, label="year {}".format(year))

    plt.legend()
    plt.show()