In [88]:
# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets download -d sobhanmoosavi/us-accidents
# !unzip us-accidents.zip

In [89]:
# !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
# !python rapidsai-csp-utils/colab/pip-install.py

In [90]:
import numpy as np
import pandas as pd
import time
from tabulate import tabulate
import cudf
from sklearn.preprocessing import LabelEncoder

In [91]:
# Read the US Accidents CSV into a cuDF DataFrame. The path is relative, so the
# file must be present in the notebook's working directory.
df=cudf.read_csv("US_Accidents_March23.csv")

In [92]:
# Print the row count, column names and dtypes of the freshly loaded frame.
df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype
---  ------                 -----
 0   ID                     object
 1   Source                 object
 2   Severity               int64
 3   Start_Time             object
 4   End_Time               object
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object
 11  Street                 object
 12  City                   object
 13  County                 object
 14  State                  object
 15  Zipcode                object
 16  Country                object
 17  Timezone               object
 18  Airport_Code           object
 19  Weather_Timestamp      object
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)            float64
 23  

In [93]:
# Remove columns that are not carried forward into the cleaned frame.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError for
# columns that have already been dropped.
df = df.drop(
    columns=["ID", "Description", "Country", "Source", "Timezone"],
    errors="ignore",
)
df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 41 columns):
 #   Column                 Dtype
---  ------                 -----
 0   Severity               int64
 1   Start_Time             object
 2   End_Time               object
 3   Start_Lat              float64
 4   Start_Lng              float64
 5   End_Lat                float64
 6   End_Lng                float64
 7   Distance(mi)           float64
 8   Street                 object
 9   City                   object
 10  County                 object
 11  State                  object
 12  Zipcode                object
 13  Airport_Code           object
 14  Weather_Timestamp      object
 15  Temperature(F)         float64
 16  Wind_Chill(F)          float64
 17  Humidity(%)            float64
 18  Pressure(in)           float64
 19  Visibility(mi)         float64
 20  Wind_Direction         object
 21  Wind_Speed(mph)        float64
 22  Precipitation(in)      float64
 

In [94]:
# Impute missing values while the columns still hold their raw values.

# A missing precipitation reading is treated as zero.
df['Precipitation(in)'] = df['Precipitation(in)'].fillna(0)

# A missing end coordinate falls back to the corresponding start coordinate.
df['End_Lat'] = df['End_Lat'].fillna(df['Start_Lat'])
df['End_Lng'] = df['End_Lng'].fillna(df['Start_Lng'])

# Continuous weather readings are filled with their own column median.
# Temperature(F) and Wind_Chill(F) are included here so that their medians are
# taken from the raw float readings, before any later integer cast.
median_filled_columns = [
    'Wind_Speed(mph)',
    'Pressure(in)',
    'Visibility(mi)',
    'Humidity(%)',
    'Temperature(F)',
    'Wind_Chill(F)',
]
for weather_col in median_filled_columns:
    df[weather_col] = df[weather_col].fillna(df[weather_col].median())

# A missing weather timestamp falls back to the accident start time.
df['Weather_Timestamp'] = df['Weather_Timestamp'].fillna(df['Start_Time'])

# Placeholder zip code for rows without one.
df['Zipcode'] = df['Zipcode'].fillna("00000")

In [95]:
# Categorical text columns that are replaced by integer label codes.
text_columns = [
    "City", "State", "Weather_Condition","Street",
    "Wind_Direction", "Sunrise_Sunset", "Civil_Twilight",
    "Nautical_Twilight", "Astronomical_Twilight","County"
]

for c in text_columns:
    # A fresh encoder per column; scikit-learn works on host data, hence the
    # round-trip through pandas. astype(str) turns nulls into an ordinary
    # string so they receive a label of their own.
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df[c].to_pandas().astype(str))

    # Pick the smallest unsigned dtype that can hold the largest code.
    # A hard-coded uint16 silently wraps around once a column has more than
    # 65,536 distinct values (likely for "Street" -- confirm with nunique()),
    # which would merge unrelated categories.
    code_dtype = np.min_scalar_type(max(len(encoder.classes_) - 1, 0))
    df[c] = cudf.Series(encoded).astype(code_dtype)

In [96]:
# Encode airport codes with an explicitly created encoder instead of relying
# on whichever `encoder` object was left over from an earlier cell.
# astype(str) mirrors the handling of the other text columns, so nulls get a
# label of their own.
encoder = LabelEncoder()
airport_codes = encoder.fit_transform(df["Airport_Code"].to_pandas().astype(str))
# Size the dtype from the number of classes: uint8 can only hold 256 distinct
# airport codes and wraps around silently beyond that.
airport_dtype = np.min_scalar_type(max(len(encoder.classes_) - 1, 0))
df["Airport_Code"] = cudf.Series(airport_codes).astype(airport_dtype)

# Narrow the weather readings to integers (fractional parts are discarded).
df['Humidity(%)'] = df['Humidity(%)'].astype('uint8')
df['Temperature(F)'] = df['Temperature(F)'].astype('int16')
# int16 to match Temperature(F): int8 only covers -128..127, so any
# wind-chill reading outside that range would wrap around.
df['Wind_Chill(F)'] = df['Wind_Chill(F)'].astype('int16')
df['Visibility(mi)'] = df['Visibility(mi)'].astype('int16')
df['Wind_Speed(mph)'] = df['Wind_Speed(mph)'].astype('int16')

In [97]:
# Columns stored as uint8: the road-feature flags plus Humidity(%).
# Temperature(F) is deliberately NOT in this list: uint8 cannot represent
# negative values, so sub-zero temperatures would wrap around to large
# positive numbers, and the cast would also undo the int16 conversion applied
# to that column earlier.
cols = ["Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit",
        "Railway", "Roundabout", "Station", "Stop", "Traffic_Calming",
        "Traffic_Signal", "Turning_Loop","Humidity(%)"]

for col in cols:
    df[col] = df[col].astype("uint8")

# Fill any remaining gaps in the temperature readings with the column median.
df['Temperature(F)'] = df['Temperature(F)'].fillna(df['Temperature(F)'].median())
df['Wind_Chill(F)'] = df['Wind_Chill(F)'].fillna(df['Wind_Chill(F)'].median())


In [98]:
# Measurements that stay floating point: keep one decimal place and store
# them in 32 bits.
float_cols = [
    "Distance(mi)",
    "Pressure(in)",
    "Precipitation(in)",
]

for col in float_cols:
    df[col] = df[col].round(decimals=1).astype("float32")


In [99]:
def auto_data_type(df):
    """Downcast every numeric column of ``df`` to a narrower dtype, in place.

    Float columns become ``float32``. Integer columns whose minimum is
    non-negative get the narrowest unsigned type (uint8/16/32, else uint64)
    that holds their maximum; the remaining integer columns get the narrowest
    signed type (int8/16/32, else int64) that holds both their minimum and
    maximum. Non-numeric columns are left untouched.

    Parameters
    ----------
    df : DataFrame
        Frame whose numeric columns are converted. It is modified in place.

    Returns
    -------
    DataFrame
        The same ``df`` object that was passed in.
    """
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)
    signed_candidates = (np.int8, np.int16, np.int32)

    for col in df.select_dtypes(include=[np.number]).columns:
        lowest = df[col].min()
        highest = df[col].max()

        if pd.api.types.is_float_dtype(df[col]):
            target = np.float32
        elif lowest >= 0:
            # First unsigned width whose upper bound covers the column maximum.
            target = next(
                (kind for kind in unsigned_candidates
                 if highest <= np.iinfo(kind).max),
                np.uint64,
            )
        else:
            # First signed width whose range covers both extremes.
            target = next(
                (kind for kind in signed_candidates
                 if np.iinfo(kind).min <= lowest and highest <= np.iinfo(kind).max),
                np.int64,
            )

        df[col] = df[col].astype(target)

    return df


# auto_data_type converts the columns of the frame it is given and returns
# that same frame, so `converted_df` and `df` refer to one object here.
converted_df=auto_data_type(df)

In [101]:
# Print the column/dtype summary of the downcast frame.
# NOTE(review): the output recorded under this cell shows a zero next to each
# column name rather than a dtype listing -- it looks stale; re-run to confirm.
converted_df.info()

Severity                 0
Start_Time               0
End_Time                 0
Start_Lat                0
Start_Lng                0
End_Lat                  0
End_Lng                  0
Distance(mi)             0
Street                   0
City                     0
County                   0
State                    0
Zipcode                  0
Airport_Code             0
Weather_Timestamp        0
Temperature(F)           0
Wind_Chill(F)            0
Humidity(%)              0
Pressure(in)             0
Visibility(mi)           0
Wind_Direction           0
Wind_Speed(mph)          0
Precipitation(in)        0
Weather_Condition        0
Amenity                  0
Bump                     0
Crossing                 0
Give_Way                 0
Junction                 0
No_Exit                  0
Railway                  0
Roundabout               0
Station                  0
Stop                     0
Traffic_Calming          0
Traffic_Signal           0
Turning_Loop             0
S