### Import the libraries 

In [25]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from datetime import timedelta

### Loading the data in a datafram.

In [26]:
data = pd.read_csv('dataset/weather-sa-2017-2019-clean.csv')
data.shape

(249023, 15)

In [27]:
data.head()

Unnamed: 0.1,Unnamed: 0,city,date,time,year,month,day,hour,minute,weather,temp,wind,humidity,barometer,visibility
0,0,Qassim,1 January 2017,00:00,2017,1,1,24,0,Clear,17,11,64%,1018.0,16
1,1,Qassim,1 January 2017,01:00,2017,1,1,1,0,Clear,17,6,64%,1018.0,16
2,2,Qassim,1 January 2017,03:00,2017,1,1,3,0,Clear,15,11,72%,1019.0,16
3,3,Qassim,1 January 2017,04:00,2017,1,1,4,0,Clear,15,11,72%,1019.0,16
4,4,Qassim,1 January 2017,05:00,2017,1,1,5,0,Clear,15,9,72%,1019.0,16


In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249023 entries, 0 to 249022
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  249023 non-null  int64  
 1   city        249023 non-null  object 
 2   date        249023 non-null  object 
 3   time        249023 non-null  object 
 4   year        249023 non-null  int64  
 5   month       249023 non-null  int64  
 6   day         249023 non-null  int64  
 7   hour        249023 non-null  int64  
 8   minute      249023 non-null  int64  
 9   weather     249023 non-null  object 
 10  temp        249023 non-null  int64  
 11  wind        249023 non-null  int64  
 12  humidity    249006 non-null  object 
 13  barometer   248951 non-null  float64
 14  visibility  249023 non-null  int64  
dtypes: float64(1), int64(9), object(5)
memory usage: 28.5+ MB


It show that we have one float, 9 Integers and 5 objects data types.

In [5]:
### Convert the data types

In [29]:
data['date'] = pd.to_datetime(data["date"])
data['humidity'] = pd.to_numeric(data["humidity"].str.replace('%',' '), downcast="float") / 100

In [7]:
data


Unnamed: 0.1,Unnamed: 0,city,date,time,year,month,day,hour,minute,weather,temp,wind,humidity,barometer,visibility
0,0,Qassim,2017-01-01,00:00,2017,1,1,24,0,Clear,17,11,0.64,1018.0,16
1,1,Qassim,2017-01-01,01:00,2017,1,1,1,0,Clear,17,6,0.64,1018.0,16
2,2,Qassim,2017-01-01,03:00,2017,1,1,3,0,Clear,15,11,0.72,1019.0,16
3,3,Qassim,2017-01-01,04:00,2017,1,1,4,0,Clear,15,11,0.72,1019.0,16
4,4,Qassim,2017-01-01,05:00,2017,1,1,5,0,Clear,15,9,0.72,1019.0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249018,2848,Jawf,2019-04-30,19:00,2019,4,30,19,0,Passing clouds,32,19,0.14,1014.0,-1
249019,2849,Jawf,2019-04-30,20:00,2019,4,30,20,0,Passing clouds,29,9,0.22,1015.0,-1
249020,2850,Jawf,2019-04-30,21:00,2019,4,30,21,0,Passing clouds,27,7,0.24,1016.0,-1
249021,2851,Jawf,2019-04-30,22:00,2019,4,30,22,0,Clear,26,0,0.26,1017.0,16


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249023 entries, 0 to 249022
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   Unnamed: 0  249023 non-null  int64         
 1   city        249023 non-null  object        
 2   date        249023 non-null  datetime64[ns]
 3   time        249023 non-null  object        
 4   year        249023 non-null  int64         
 5   month       249023 non-null  int64         
 6   day         249023 non-null  int64         
 7   hour        249023 non-null  int64         
 8   minute      249023 non-null  int64         
 9   weather     249023 non-null  object        
 10  temp        249023 non-null  int64         
 11  wind        249023 non-null  int64         
 12  humidity    249006 non-null  float32       
 13  barometer   248951 non-null  float64       
 14  visibility  249023 non-null  int64         
dtypes: datetime64[ns](1), float32(1), float64(1), int64

### Drop Unnecessary columns

Unnamed column in the index of the data and we do not need it because datafram hundle that.

In [31]:
delete_col =['Unnamed','day','hour','minute']


### Dealing with missing values

In [32]:
data.isnull().sum()

Unnamed: 0     0
city           0
date           0
time           0
year           0
month          0
day            0
hour           0
minute         0
weather        0
temp           0
wind           0
humidity      17
barometer     72
visibility     0
dtype: int64

In [33]:
data[data['humidity'].isnull() == True]

Unnamed: 0.1,Unnamed: 0,city,date,time,year,month,day,hour,minute,weather,temp,wind,humidity,barometer,visibility
3385,3385,Qassim,2017-05-26,10:00,2017,5,26,10,0,Sunny,37,7,,,3
5270,5270,Qassim,2017-08-13,04:00,2017,8,13,4,0,Clear,34,4,,,16
6723,6723,Qassim,2017-10-13,16:00,2017,10,13,16,0,Sunny,38,9,,,16
23178,23178,Hail,2017-05-26,10:00,2017,5,26,10,0,Sunny,37,7,,,3
25063,25063,Hail,2017-08-13,04:00,2017,8,13,4,0,Clear,34,4,,,16
26516,26516,Hail,2017-10-13,16:00,2017,10,13,16,0,Sunny,38,9,,,16
70412,70412,EP,2018-05-10,14:00,2018,5,10,14,0,Clear,37,26,,1005.0,-1
72357,72357,EP,2018-08-04,19:00,2018,8,4,19,0,Clear,36,13,,999.0,16
72892,72892,EP,2018-08-28,04:00,2018,8,28,4,0,Clear,30,7,,997.0,16
74947,74947,EP,2018-11-26,07:00,2018,11,26,7,0,Sunny,16,19,,1017.0,16


In [11]:
data['city'].unique()

array(['Qassim', 'Hail', 'Madina', 'EP', 'Riyadh', 'Mecca', 'Tabuk',
       'Assir', 'Northern boarder', 'Jazan', 'Najran', 'Baha', 'Jawf'],
      dtype=object)

In [50]:
# means for humidity and barometer per city to be filled in the missing values.
data.groupby(by='city')[['humidity','barometer']].mean().reset_index()

Unnamed: 0,city,humidity,barometer
0,Assir,0.475902,1023.498147
1,Baha,0.38723,1019.012376
2,EP,0.450927,1008.609077
3,Hail,0.415361,1013.528736
4,Jawf,0.305004,1015.505283
5,Jazan,0.458031,1023.087145
6,Madina,0.199716,1014.141254
7,Mecca,0.541588,1008.713919
8,Najran,0.459536,1023.093818
9,Northern boarder,0.262638,1012.641971


In [51]:
# filling the missing values with means for each city
data['humidity'] = data['humidity'].fillna(data.groupby('city')['humidity'].transform('mean'))
data['barometer'] = data['barometer'].fillna(data.groupby('city')['barometer'].transform('mean'))



In [52]:
data.loc[3385]

Unnamed: 0                   3385
city                       Qassim
date          2017-05-26 00:00:00
time                        10:00
year                         2017
month                           5
day                            26
hour                           10
minute                          0
weather                    Sunny 
temp                           37
wind                            7
humidity                 0.414783
barometer             1013.489841
visibility                      3
Name: 3385, dtype: object

In [53]:
data[data['humidity'].isnull() == True]

Unnamed: 0.1,Unnamed: 0,city,date,time,year,month,day,hour,minute,weather,temp,wind,humidity,barometer,visibility


In [54]:
data.isnull().sum()

Unnamed: 0    0
city          0
date          0
time          0
year          0
month         0
day           0
hour          0
minute        0
weather       0
temp          0
wind          0
humidity      0
barometer     0
visibility    0
dtype: int64

In [18]:
#data['humidity'].fillna(data['humidity'].where(data['city'] == 'Qassim').mean()).where(data['city'] == 'Qassim')

## Weather data and create the target
#### data and imbalnce might need to downsample. we should not use teh 
### we should not use accurcy `.score` beacuse imbalnce data
### use recall or/and precision. combination called F1 scorse which is better.



In [180]:
data.groupby('weather')['Rain'].unique().sample(60)

weather
Heavy rain  More clouds than sun                [True]
Thundershowers  Scattered clouds                [True]
Thunderstorms  Overcast                         [True]
Refreshingly cool                              [False]
Sprinkles  Overcast                             [True]
Light rain  More clouds than sun                [True]
Sprinkles  Low level haze                       [True]
Thunderstorms  Scattered clouds                 [True]
Light rain  Partly sunny                        [True]
Heavy rain  Overcast                            [True]
Thunderstorms  Partly sunny                     [True]
Rain  Partly cloudy                             [True]
Drizzle  Fog                                   [False]
Thunderstorms  Partly cloudy                    [True]
Sunny                                          [False]
Drizzle  Overcast                              [False]
Heavy rain  Partly sunny                        [True]
Thunderstorms  Cloudy                           [True]
Th

In [111]:
data['Rain'].value_counts()

False    245654
True       3369
Name: Rain, dtype: int64

In [112]:
import re
data['Rain'] = data['weather'].str.contains('rain|shower|sprinkle|thunderstorm', case = False ,regex=True)


In [154]:

#data.loc[:,['weather','Rain']] 
#data.groupby(by='weather')['Rain'].sum().head(60)
data['weather'].unique()

array(['Clear ', 'Sunny ', 'Scattered clouds ', 'Partly sunny ',
       'Passing clouds ', 'Refreshingly cool ', 'Low level haze ',
       'Duststorm ', 'Thunderstorms  Passing clouds ', 'Fog ',
       'Thunderstorms  Partly sunny ', 'Light rain  Partly sunny ',
       'Dense fog ', 'Thunderstorms  Scattered clouds ',
       'Rain  Passing clouds ', 'Extremely hot ', 'Rain  Partly sunny ',
       'Pleasantly warm ', 'Hot ', 'Mild ', 'Overcast ',
       'Rain  Overcast ', 'Smoke ', 'Thunderstorms  Broken clouds ',
       'Heavy rain  Partly sunny ', 'Thunderstorms  Overcast ',
       'Light rain  Overcast ', 'Warm ', 'Thunderstorms  Cloudy ',
       'Drizzle  Overcast ', 'Thunderstorms  Partly cloudy ',
       'Broken clouds ', 'Sandstorm ', 'Partly cloudy ', 'Mostly cloudy ',
       'Rain  Partly cloudy ', 'Rain  Broken clouds ',
       'Rain  Scattered clouds ', 'Haze ', 'Rain  Mostly cloudy ',
       'Hail  Partly sunny ', 'Thundershowers  Passing clouds ',
       'Thunderstorms  Mor

In [107]:
data.[]

SyntaxError: invalid syntax (80440395.py, line 1)

In [None]:
#### 