In [1]:
import pandas as pd

In [2]:
rain_data = pd.read_csv("all_data.csv", index_col=0)

In [3]:
rain_data.dtypes

date               object
time               object
station            object
value             float64
humidity          float64
temperature       float64
wind_direction    float64
wind_speed        float64
dtype: object

In [4]:
rain_data

Unnamed: 0,date,time,station,value,humidity,temperature,wind_direction,wind_speed
0,2023-02-03,00:05:00,S77,0.0,,,,
1,2023-02-03,00:10:00,S77,0.0,,,,
2,2023-02-03,00:15:00,S77,0.0,,,,
3,2023-02-03,00:20:00,S77,0.0,,,,
4,2023-02-03,00:25:00,S77,0.0,,,,
...,...,...,...,...,...,...,...,...
25797649,2020-06-09,23:35:00,S122,,,,,
25797650,2020-06-09,23:40:00,S122,,,,,
25797651,2020-06-09,23:45:00,S122,,,,,
25797652,2020-06-09,23:50:00,S122,,,,,


### Check the number of non NA values for the dataset, for each station

In [5]:
pd.set_option('display.max_rows', 90)

rain_data.groupby("station").count()

Unnamed: 0_level_0,date,time,value,humidity,temperature,wind_direction,wind_speed
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S07,317622,317622,96902,0,0,0,0
S08,317622,317622,316489,0,0,0,0
S100,321526,321526,313207,313224,313192,62090,62242
S102,321526,321526,30235,26149,28308,4992,5050
S104,321526,321526,306651,306005,306003,58984,59254
S105,317622,317622,2126,0,0,0,0
S106,321526,321526,298500,293516,293519,56517,56652
S107,321526,321526,313432,313821,313795,60142,60850
S108,321526,321526,296736,258179,296354,58309,58791
S109,321526,321526,303390,283231,303678,60075,60298


#### Dropping rows with no rain-fall reading at the particular timestamp

In [6]:
rain_data.dropna(axis=0,subset=["value"],inplace=True)
rain_data.groupby("station").count()

Unnamed: 0_level_0,date,time,value,humidity,temperature,wind_direction,wind_speed
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
S07,96902,96902,96902,0,0,0,0
S08,316489,316489,316489,0,0,0,0
S100,313207,313207,313207,310703,310671,61348,61497
S102,30235,30235,30235,26125,28284,1067,1077
S104,306651,306651,306651,303393,303391,56892,57156
S105,2126,2126,2126,0,0,0,0
S106,298500,298500,298500,291108,291111,53486,53609
S107,313432,313432,313432,311185,311159,58772,59469
S108,296736,296736,296736,255660,293737,54825,55263
S109,303390,303390,303390,280621,301068,57216,57421


## Covariance and Correlation of different columns

#### Including the rows without humidity, temperature, windspeed, wind direction

In [7]:
cov_matrix = pd.DataFrame.cov(rain_data)
print(cov_matrix)

  cov_matrix = pd.DataFrame.cov(rain_data)


                   value    humidity  temperature  wind_direction  wind_speed
value           0.086185    0.337633    -0.080860        0.626785   -0.000576
humidity        0.337633  133.938702   -20.483001       20.823417   -0.345789
temperature    -0.080860  -20.483001     4.685994        0.721350    0.278603
wind_direction  0.626785   20.823417     0.721350    13346.512069  -48.862658
wind_speed     -0.000576   -0.345789     0.278603      -48.862658    8.605431


In [8]:
corr_matrix = rain_data.corr()
print(corr_matrix)

  corr_matrix = rain_data.corr()


                   value  humidity  temperature  wind_direction  wind_speed
value           1.000000  0.109749    -0.133924        0.011120   -0.000401
humidity        0.109749  1.000000    -0.819775        0.017548   -0.011448
temperature    -0.133924 -0.819775     1.000000        0.003014    0.045875
wind_direction  0.011120  0.017548     0.003014        1.000000   -0.144276
wind_speed     -0.000401 -0.011448     0.045875       -0.144276    1.000000


#### Excluding the rows without humidity, temperature, windspeed, wind direction

In [9]:
stations_with_humidity = rain_data
stations_with_humidity.dropna(axis=0,how="any",inplace=True)

In [10]:
cov_matrix = pd.DataFrame.cov(stations_with_humidity)
print(cov_matrix)

                   value    humidity  temperature  wind_direction  wind_speed
value           0.215236    0.711270    -0.174531        0.461502   -0.005399
humidity        0.711270  105.235047   -17.336823       20.257653   -0.332593
temperature    -0.174531  -17.336823     4.310037        1.776926    0.312627
wind_direction  0.461502   20.257653     1.776926    13373.064016  -50.767260
wind_speed     -0.005399   -0.332593     0.312627      -50.767260    8.649832


  cov_matrix = pd.DataFrame.cov(stations_with_humidity)


In [11]:
corr_matrix = stations_with_humidity.corr()
print(corr_matrix)

                   value  humidity  temperature  wind_direction  wind_speed
value           1.000000  0.149450    -0.181207        0.008602   -0.003957
humidity        0.149450  1.000000    -0.814046        0.017076   -0.011024
temperature    -0.181207 -0.814046     1.000000        0.007401    0.051202
wind_direction  0.008602  0.017076     0.007401        1.000000   -0.149267
wind_speed     -0.003957 -0.011024     0.051202       -0.149267    1.000000


  corr_matrix = stations_with_humidity.corr()
