<h1> Practice Machine Learning Project with Weather Data</h1>
<h3> Step 1: Load the weather data from the csv files in the dataset folder</h3>

In [104]:
import pandas as pd

def load_weather_data(csv_file):
    city_attrib_csv_path = "datasets/historical-hourly-weather-data/" + csv_file
    return pd.read_csv(city_attrib_csv_path)

<h3>Step 2: Assign the loaded weather data to values for the new table</h3>

In [173]:
city_attributes = load_weather_data("city_attributes.csv")
humidity = load_weather_data("humidity.csv")
pressure = load_weather_data("pressure.csv")
temperature = load_weather_data("temperature.csv")
weather_description = load_weather_data("weather_description.csv")
wind_direction = load_weather_data("wind_direction.csv")
wind_speed = load_weather_data("wind_speed.csv")

<h3>Step 3: Create a new dataset for a specific city</h3>

In [174]:
def load_city_dataset(city_name):
    city_dataset = pd.DataFrame(data=humidity["datetime"])
    
    # create new columns for month, day, hour_of_day
    # this will be used to split up data in "datatime" column
    city_dataset["month"] = ""
    city_dataset["day"] = ""
    city_dataset["hour_of_day"] = ""
    
    # seperate the values for datetime into month, day, hour_of_day int columns
    for i, date in enumerate(city_dataset["datetime"]):
        date, time = date.split(" ", 1)
        year, month, day = date.split("-", 2)
        hours, minute_seconds = time.split(":", 1)
        
        city_dataset["month"][i] = int(month)
        city_dataset["day"][i] = int(day)
        city_dataset["hour_of_day"][i] = int(hours)

    # drop datetime column
    city_dataset = city_dataset.drop("datetime", axis=1)
        
    # create new columns
    # assign weather data for the city to columns
    city_dataset["humidity"] = humidity[city_name]
    city_dataset["pressure"] = pressure[city_name]
    city_dataset["temperature"] = temperature[city_name]
    city_dataset["weather_description"] = weather_description[city_name]
    city_dataset["wind_direction"] = wind_direction[city_name]
    city_dataset["wind_speed"] = wind_speed[city_name]
    
    return city_dataset

In [175]:
new_york_weather = load_city_dataset("New York")

In [176]:
new_york_weather

Unnamed: 0,month,day,hour_of_day,humidity,pressure,temperature,weather_description,wind_direction,wind_speed
0,10,1,12,,,,,,
1,10,1,13,58.0,1012.0,288.220000,few clouds,260.0,7.0
2,10,1,14,57.0,1012.0,288.247676,few clouds,260.0,7.0
3,10,1,15,57.0,1012.0,288.326940,few clouds,260.0,7.0
4,10,1,16,57.0,1012.0,288.406203,few clouds,260.0,7.0
...,...,...,...,...,...,...,...,...,...
45248,11,29,20,,,,,,
45249,11,29,21,,,,,,
45250,11,29,22,,,,,,
45251,11,29,23,,,,,,


<h3>Step 4: Clean the weather data</h3>

In [177]:
def clean_weather_data (data, method):
    
    # replace NaN values with median of column
    if method == "median" :
        median_humidity = data["humidity"].median()
        data["humidity"].fillna(median_humidity, inplace=True)
        median_pressure = data["pressure"].median()
        data["pressure"].fillna(median_pressure, inplace=True)
        median_temperature = data["temperature"].median()
        data["temperature"].fillna(median_temperature, inplace=True)
        median_wind_direction = data["wind_direction"].median()
        data["wind_direction"].fillna(median_wind_direction, inplace=True)
        median_wind_speed = data["wind_speed"].median()
        data["wind_speed"].fillna(median_wind_speed, inplace=True)
        
    # replace NaN values with mean of column
    if method == "mean" :
        mean_humidity = data["humidity"].mean()
        data["humidity"].fillna(mean_humidity, inplace=True)
        mean_pressure = data["pressure"].mean()
        data["pressure"].fillna(mean_pressure, inplace=True)
        mean_temperature = data["temperature"].mean()
        data["temperature"].fillna(mean_temperature, inplace=True)
        mean_wind_direction = data["wind_direction"].mean()
        data["wind_direction"].fillna(mean_wind_direction, inplace=True)
        mean_wind_speed = data["wind_speed"].mean()
        data["wind_speed"].fillna(mean_wind_speed, inplace=True)
    
    # drop rows with NaN value(s)
    if method == "drop" :
        data = data.dropna()
        
    return data

In [198]:
clean_ny_weather = clean_weather_data(new_york_weather, method="drop")

In [199]:
clean_ny_weather

Unnamed: 0,month,day,hour_of_day,humidity,pressure,temperature,weather_description,wind_direction,wind_speed
1,10,1,13,58.0,1012.0,288.220000,few clouds,260.0,7.0
2,10,1,14,57.0,1012.0,288.247676,few clouds,260.0,7.0
3,10,1,15,57.0,1012.0,288.326940,few clouds,260.0,7.0
4,10,1,16,57.0,1012.0,288.406203,few clouds,260.0,7.0
5,10,1,17,57.0,1012.0,288.485467,few clouds,261.0,6.0
...,...,...,...,...,...,...,...,...,...
44456,10,27,20,36.0,1019.0,289.980000,sky is clear,0.0,3.0
44457,10,27,21,38.0,1019.0,289.480000,sky is clear,0.0,1.0
44458,10,27,22,54.0,1019.0,287.920000,sky is clear,196.0,2.0
44459,10,27,23,62.0,1020.0,285.830000,sky is clear,171.0,3.0


<h3>Step 5: Split the data into a training and testing set</h3>

In [241]:
from sklearn.model_selection import train_test_split

training_set, testing_set = train_test_split(clean_ny_weather, test_size=0.2, random_state=21)

In [242]:
weather_description = training_set[["weather_description"]]
train_labels = training_set["temperature"]
training_set = training_set.drop("temperature", axis=1)

<h3>Step 6: Quantify the weather_description column</h3>

In [243]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
encoded_desc = encoder.fit_transform(weather_description)
encoded_desc.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [244]:
encoder.categories_

[array(['broken clouds', 'drizzle', 'dust', 'few clouds', 'fog',
        'freezing rain', 'haze', 'heavy intensity drizzle',
        'heavy intensity rain', 'heavy snow', 'heavy thunderstorm',
        'light intensity drizzle', 'light rain', 'light rain and snow',
        'light snow', 'mist', 'moderate rain', 'overcast clouds',
        'proximity thunderstorm', 'proximity thunderstorm with drizzle',
        'proximity thunderstorm with rain', 'sand', 'scattered clouds',
        'shower rain', 'sky is clear', 'smoke', 'snow', 'squalls',
        'thunderstorm', 'thunderstorm with heavy rain',
        'thunderstorm with light drizzle', 'thunderstorm with light rain',
        'thunderstorm with rain', 'very heavy rain'], dtype=object)]

<h3>Step 7: Pipelines and feature scaling</h3>

In [245]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

numbers_pipe = Pipeline([
     ('std_scaler', StandardScaler()),
])

train_num = training_set.drop("weather_description", axis=1)
weather_numbers_scaled = numbers_pipe.fit_transform(train_num)

In [246]:
weather_numbers_scaled

array([[ 0.6972499 , -1.10570426,  1.66113432, ..., -3.14123339,
         0.52864113, -1.48087155],
       [-0.1744145 , -1.56226907,  1.51660807, ..., -0.10204833,
        -0.43122758, -0.56040251],
       [ 0.9878047 , -1.21984546, -0.94033826, ..., -0.10204833,
        -1.36313896,  1.28053557],
       ...,
       [ 0.1161403 , -1.33398666,  0.64945054, ...,  0.58421926,
         0.22111038,  0.82030105],
       [-0.1744145 , -0.87742185,  1.51660807, ..., -1.18046884,
        -1.27926693, -0.56040251],
       [ 0.4066951 , -1.21984546,  0.79397679, ..., -0.10204833,
         0.96663948, -1.02063703]])

In [247]:
from sklearn.compose import ColumnTransformer

num_attribs = list(train_num)
cat_attribs = ["weather_description"]

full_pipeline = ColumnTransformer([
    ("num", numbers_pipe, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

ny_weather_prepared = full_pipeline.fit_transform(training_set)

In [248]:
ny_weather_prepared

<34887x41 sparse matrix of type '<class 'numpy.float64'>'
	with 279096 stored elements in Compressed Sparse Row format>

In [249]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(ny_weather_prepared, train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [250]:
some_data = training_set.iloc[:5]
some_labels = train_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

In [252]:
lin_reg.predict(some_data_prepared)

array([299.92600167, 288.67132277, 283.264101  , 276.79107806,
       291.69258759])

In [253]:
some_labels

8171     292.35
14626    300.16
8849     292.51
3473     269.07
41020    286.15
Name: temperature, dtype: float64

In [254]:
from sklearn.metrics import mean_squared_error

weather_predictions = lin_reg.predict(ny_weather_prepared)
lin_mse = mean_squared_error(train_labels, weather_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

8.837682211966962

In [255]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(ny_weather_prepared, train_labels)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [257]:
weather_predictions = tree_reg.predict(ny_weather_prepared)
tree_mse = mean_squared_error(train_labels, weather_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

6.330921323624101e-15

<h3>Better evaluations using cross validation</h3>

In [258]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, ny_weather_prepared, train_labels,
                        scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [259]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
display_scores(tree_rmse_scores)

Scores: [3.7239201  3.65996003 3.41341984 3.37626647 3.54786102 3.55730331
 3.30189188 3.49970814 3.49508355 3.47565707]
Mean: 3.505107139626083
Standard deviation: 0.119879402232171


In [261]:
lin_scores = cross_val_score(lin_reg, ny_weather_prepared, train_labels,
                        scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)

In [262]:
display_scores(lin_rmse_scores)

Scores: [8.82102154 9.02742041 8.76685186 8.85538158 8.89902648 8.8747015
 8.72528126 8.81591809 8.87457479 8.78613381]
Mean: 8.844631133538273
Standard deviation: 0.07961468989744755


<h3>Random forest performance</h3>

In [263]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(ny_weather_prepared, train_labels)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [265]:
forest_scores = cross_val_score(forest_reg, ny_weather_prepared, train_labels,
                               scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

In [266]:
display_scores(forest_rmse_scores)

Scores: [2.4196389  2.52560749 2.34236737 2.36038309 2.46753083 2.39358889
 2.31698651 2.34834691 2.45631114 2.42046782]
Mean: 2.405122894518355
Standard deviation: 0.06208234816228608


<h3>Random forest is performing the best so far so lets fine tune the model</h3>

In [284]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [300], 'max_features': [14, 20], 'bootstrap': [False, True]}
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error',
                          return_train_score=True)

grid_search.fit(ny_weather_prepared, train_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [285]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 20, 'n_estimators': 300}

In [286]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.69353195e-01, 5.21635584e-02, 5.72215468e-02, 4.01011191e-02,
       8.13465277e-02, 5.26358868e-02, 2.11429550e-02, 2.34875675e-03,
       1.41006316e-04, 1.78082556e-05, 1.83871072e-03, 9.67359327e-04,
       1.31840333e-05, 4.99843206e-04, 6.46699566e-06, 2.28533176e-04,
       2.25770766e-04, 6.88245252e-07, 2.29691690e-04, 2.28609545e-03,
       7.36247710e-06, 1.45339638e-03, 2.45058622e-03, 6.17977638e-04,
       2.41499341e-03, 8.92112141e-05, 2.31432183e-07, 1.33290913e-07,
       5.21806839e-07, 1.74146494e-03, 4.14008089e-07, 8.05250803e-03,
       5.92828852e-06, 2.22290375e-04, 1.99182107e-05, 6.17202192e-05,
       3.05520104e-06, 3.49104537e-07, 2.09700280e-05, 3.57274571e-06,
       6.46914505e-05])

In [287]:
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.6693531947717006, 'month'),
 (0.08134652773500912, 'pressure'),
 (0.057221546794184015, 'hour_of_day'),
 (0.052635886809405445, 'wind_direction'),
 (0.05216355840582447, 'day'),
 (0.04010111905360113, 'humidity'),
 (0.021142955039363105, 'wind_speed'),
 (0.0080525080279215, 'sky is clear'),
 (0.0024505862211561744, 'mist'),
 (0.0024149934071131325, 'overcast clouds'),
 (0.0023487567450449047, 'broken clouds'),
 (0.002286095445860381, 'light rain'),
 (0.001838710718388625, 'few clouds'),
 (0.0017414649405073708, 'scattered clouds'),
 (0.0014533963838589356, 'light snow'),
 (0.0009673593268625471, 'fog'),
 (0.0006179776380399563, 'moderate rain'),
 (0.0004998432063751955, 'haze'),
 (0.00022969168993240027, 'light intensity drizzle'),
 (0.0002285331763480856, 'heavy intensity rain'),
 (0.00022577076582211896, 'heavy snow'),
 (0.00022229037489179557, 'snow'),
 (0.00014100631562554902, 'drizzle'),
 (8.921121406820866e-05, 'proximity thunderstorm'),
 (6.469145050474444e-05, 'very heavy r

In [288]:
best_estimator = grid_search.best_estimator_
best_estimator_score = cross_val_score(best_estimator, ny_weather_prepared, train_labels,
                               scoring="neg_mean_squared_error", cv=10)
estimator_rmse_scores = np.sqrt(-best_estimator_score)

<h3>2.57 mean with n_estimators = 300 and max_features = 14

In [289]:
display_scores(estimator_rmse_scores)

Scores: [2.23782546 2.36477991 2.18810495 2.22024928 2.28750892 2.22907451
 2.13492649 2.18787592 2.27974643 2.26695363]
Mean: 2.239704550800286
Standard deviation: 0.06090744629135487
