In [2]:
# %pip install pandas matplotlib 
# %pip install streamlit
# Pandas is used for data manipulation
import pandas as pd

1. Problem statement and data source selection
2. Acquire the data in an accessible format
3. Identify and correct missing data points/anomalies as required: feature extraction and data cleaning/sanitization
4. Prepare the data for the machine learning model: dimensionality reduction, transformation, and preparation
5. Train the model on the training data
6. Make predictions on the test data
7. Compare predictions to the known test set targets and calculate performance metrics
8. If performance is not satisfactory, adjust the model, acquire more data, or try a different modeling technique

In [41]:
# Load the temperature dataset; the strings 'none' and 'null' are parsed as NaN.
# Column notes -- temp_2: max temp 2 days prior, temp_1: max temp 1 day prior.
df = pd.read_csv(
    './classOfBA/temperature.csv',
    na_values=('none', 'null'),
)
df.tail()   # last 5 rows (discarded: only the cell's final expression is rendered)
df.head()   # first 5 rows -- this is what the cell displays

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,forecast_noaa,actual
0,2016,1,1,Fri,45,45,45.6,43,45.0
1,2016,1,2,Sat,44,45,45.7,41,44.0
2,2016,1,3,Sun,45,44,45.8,43,41.0
3,2016,1,4,Mon,44,41,45.9,44,40.0
4,2016,1,5,Tues,41,40,46.0,46,44.0


In [30]:
# Report the dataset dimensions as a (rows, columns) tuple, followed by the
# column labels converted from a pandas Index into a plain Python list.
print(df.shape, df.columns.to_list(), sep='\n')

(348, 9)
['year', 'month', 'day', 'week', 'temp_2', 'temp_1', 'average', 'forecast_noaa', 'actual']


In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# In the recorded output 'actual' has count 345 against 348 for the other columns
# (missing values), and temp_1/temp_2 peak at 117 while their 75th percentile is 71,
# which is worth checking for outliers.
df.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,forecast_noaa,actual
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,345.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,57.238506,62.550725
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,10.605746,11.823971
min,2016.0,1.0,1.0,35.0,35.0,45.1,41.0,35.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,48.0,54.0
50%,2016.0,6.0,15.0,62.5,62.5,58.2,56.0,62.0
75%,2016.0,10.0,23.0,71.0,71.0,69.025,66.0,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,77.0,92.0


In [43]:
# Per-column dtype and non-null count; a quick way to spot missing values
# (the recorded output shows 345 of 348 non-null entries for 'actual').
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           348 non-null    int64  
 1   month          348 non-null    int64  
 2   day            348 non-null    int64  
 3   week           348 non-null    object 
 4   temp_2         348 non-null    int64  
 5   temp_1         348 non-null    int64  
 6   average        348 non-null    float64
 7   forecast_noaa  348 non-null    int64  
 8   actual         345 non-null    float64
dtypes: float64(2), int64(6), object(1)
memory usage: 24.6+ KB


In [None]:
# Inspect the distinct values in a column and how often each occurs; rare or
# unexpected values can point to outliers. Only the last expression is rendered.
df.month.value_counts()        # a column can be accessed as an attribute of the DataFrame
df['month'].value_counts()       # same column selected by name (a single string key)
df[['month','temp_2']].value_counts() # several columns: pass a list of names; counts each unique combination

In [63]:
# iloc[row positions, column positions] selects by integer position; loc selects by label.
# Only the last expression (the boolean filter) is rendered as the cell output.
df.iloc[:,:-1]    # all rows, every column except the last
df.iloc[:,-1]   # all rows, last column only
df.loc[:,['actual', 'month']]  # all rows, columns picked by name
df[df.month<10]                # keep rows whose month is below 10; the same kind of mask can be used to remove outliers


Unnamed: 0,year,month,day,week,temp_2,temp_1,average,forecast_noaa,actual
0,2016,1,1,Fri,45,45,45.6,43,45.0
1,2016,1,2,Sat,44,45,45.7,41,44.0
2,2016,1,3,Sun,45,44,45.8,43,41.0
3,2016,1,4,Mon,44,41,45.9,44,40.0
4,2016,1,5,Tues,41,40,46.0,46,44.0


In [70]:
# Locate missing values and remove the rows that contain them.
df.isna().sum()                        # NaN count per column, before cleaning (not rendered)
df1 = df.dropna()                      # default inplace=False: returns a cleaned copy, df is untouched
df.dropna(axis='index', inplace=True)  # axis='index' (0) drops rows; axis='columns' (1) would drop columns
# With inplace=True the call modifies df itself and returns None.
df.isna().sum()                        # NaN count per column, after cleaning (not rendered)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 345 entries, 0 to 347
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           345 non-null    int64  
 1   month          345 non-null    int64  
 2   day            345 non-null    int64  
 3   week           345 non-null    object 
 4   temp_2         345 non-null    int64  
 5   temp_1         345 non-null    int64  
 6   average        345 non-null    float64
 7   forecast_noaa  345 non-null    int64  
 8   actual         345 non-null    float64
dtypes: float64(2), int64(6), object(1)
memory usage: 27.0+ KB


In [73]:
# Re-inspect the frame. Only the last expression (the first 10 rows) is
# rendered; the describe() result is computed but discarded.
df.describe()
df.head(10)

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,forecast_noaa,actual
0,2016,1,1,Fri,45,45,45.6,43,45.0
1,2016,1,2,Sat,44,45,45.7,41,44.0
2,2016,1,3,Sun,45,44,45.8,43,41.0
3,2016,1,4,Mon,44,41,45.9,44,40.0
4,2016,1,5,Tues,41,40,46.0,46,44.0
5,2016,1,6,Wed,40,44,46.1,43,51.0
6,2016,1,7,Thurs,44,51,46.2,45,45.0
7,2016,1,8,Fri,51,45,46.3,43,48.0
8,2016,1,9,Sat,45,48,46.4,46,50.0
9,2016,1,10,Sun,48,50,46.5,45,52.0


In [75]:
# One-hot encode the categorical 'week' column.
#
# Manual route, kept for illustration: encode the single column, drop the
# original and join the indicator columns back. It must run BEFORE df is
# re-assigned below, because pd.get_dummies(df) replaces 'week' with week_*
# columns, after which df.week raises AttributeError and df.drop('week')
# raises KeyError. The guard also keeps the cell safe to re-run once df has
# already been encoded.
if 'week' in df.columns:
    one_hot = pd.get_dummies(df['week'])
    data_df = df.drop('week', axis=1).join(one_hot)

# Whole-frame route: object-typed columns (here only 'week') are expanded
# into indicator columns; numeric columns are left unchanged.
df = pd.get_dummies(df)
df

Unnamed: 0,year,month,day,temp_2,temp_1,average,forecast_noaa,actual,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,43,45.0,True,False,False,False,False,False,False
1,2016,1,2,44,45,45.7,41,44.0,False,False,True,False,False,False,False
2,2016,1,3,45,44,45.8,43,41.0,False,False,False,True,False,False,False
3,2016,1,4,44,41,45.9,44,40.0,False,True,False,False,False,False,False
4,2016,1,5,41,40,46.0,46,44.0,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,2016,12,27,42,42,45.2,41,47.0,False,False,False,False,False,True,False
344,2016,12,28,42,47,45.3,41,48.0,False,False,False,False,False,False,True
345,2016,12,29,47,48,45.3,43,48.0,False,False,False,False,True,False,False
346,2016,12,30,48,48,45.4,44,57.0,True,False,False,False,False,False,False


In [77]:
# Pairwise correlation between the columns.
df.corr()                        # full matrix (discarded: not the final expression)
df.drop(columns='year').corr()   # same matrix with the 'year' column left out; this one is rendered

Unnamed: 0,month,day,temp_2,temp_1,average,forecast_noaa,actual,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
month,1.0,-0.009995,0.045236,0.029096,0.116514,0.126946,0.002471,-0.012292,0.000682,0.012561,-0.008821,-0.00407,0.012333,-0.0005
day,-0.009995,1.0,-0.0506,-0.006016,-0.027414,-0.027619,-0.02414,-0.011298,-0.024115,0.004367,-0.00038,0.013861,0.012513,0.00498
temp_2,0.045236,-0.0506,1.0,0.856588,0.823478,0.815926,0.806561,-0.016397,-0.016766,-0.017449,0.02767,0.004427,0.039097,-0.02128
temp_1,0.029096,-0.006016,0.856588,1.0,0.820964,0.813098,0.879505,-0.022718,0.007791,0.001614,-0.007996,-0.006623,0.021532,0.006285
average,0.116514,-0.027414,0.823478,0.820964,1.0,0.990405,0.849049,-0.025017,0.004639,-0.003791,0.021184,-0.014743,0.030276,-0.012956
forecast_noaa,0.126946,-0.027619,0.815926,0.813098,0.990405,1.0,0.839887,-0.008911,-0.006414,0.005315,0.023298,-0.025961,0.026313,-0.014127
actual,0.002471,-0.02414,0.806561,0.879505,0.849049,0.839887,1.0,0.008692,0.001417,-0.028121,-0.018978,-0.004913,0.045591,-0.004565
week_Fri,-0.012292,-0.011298,-0.016397,-0.022718,-0.025017,-0.008911,0.008692,1.0,-0.167504,-0.167504,-0.167504,-0.167504,-0.171469,-0.165507
week_Mon,0.000682,-0.024115,-0.016766,0.007791,0.004639,-0.006414,0.001417,-0.167504,1.0,-0.165541,-0.165541,-0.165541,-0.169459,-0.163567
week_Sat,0.012561,0.004367,-0.017449,0.001614,-0.003791,0.005315,-0.028121,-0.167504,-0.165541,1.0,-0.165541,-0.165541,-0.169459,-0.163567


In [78]:
# Drop 'year': it holds a single value in this dataset (std 0.0 in the
# describe() output), so it has no dynamic range and no correlation to offer.
# errors='ignore' keeps the cell idempotent; without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns='year', errors='ignore')
df.shape

(345, 14)

In [79]:
# scikit-learn helper that partitions data into training and testing subsets
from sklearn.model_selection import train_test_split

# Model inputs are every column except the target; the target is 'actual'.
features = df.drop(columns='actual')
labels = df['actual']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (276, 13)
Training Labels Shape: (276,)
Testing Features Shape: (69, 13)
Testing Labels Shape: (69,)


In [80]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import pickle

# Instantiate the model. The seed is fixed so the fitted forest, and every
# error figure computed from it, is the same on each run; it matches the
# random_state already used for the train/test split.
rf = RandomForestRegressor(random_state=42)
# rf = LinearRegression()   # alternative model to compare against
# Train the model on the training data
rf.fit(train_features, train_labels)

# Persist the fitted model so it can be reloaded without retraining.
file_path = 'data.pickle'
# Binary mode is required: pickle writes bytes, not text.
with open(file_path, 'wb') as file:
    # Serialize the model and write it to the file
    pickle.dump(rf, file)

In [88]:
import numpy as np

# Predict the target for the held-out feature rows.
predictions = rf.predict(test_features)

# Mean absolute error: average magnitude of (prediction - truth).
errors = (predictions - test_labels).abs()
np.mean(errors)

3.9534782608695647

In [89]:
# Names and order of the columns that make up one input feature vector
list(test_features.columns)

['month',
 'day',
 'temp_2',
 'temp_1',
 'average',
 'forecast_noaa',
 'week_Fri',
 'week_Mon',
 'week_Sat',
 'week_Sun',
 'week_Thurs',
 'week_Tues',
 'week_Wed']

In [92]:
# Reload the pickled model from disk and score it on the test set again.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files you
# created yourself or otherwise trust.
file_path = 'data.pickle'
with open(file_path, 'rb') as file:   # binary mode: pickle data is bytes
    model = pickle.load(file)

# Mean absolute error of the reloaded model's predictions.
predictions = model.predict(test_features)
errors = (predictions - test_labels).abs()
np.mean(errors)

3.9534782608695647