In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller

In [None]:
df = pd.read_csv("weatherHistory.csv", parse_dates=["Formatted Date"])

In [6]:
df.columns

Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Daily Summary'],
      dtype='object')

In [7]:
df.isnull().sum()

Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64

In [8]:
#imputing precip type(categorical variable with mode)
df['Precip Type'].fillna(df['Precip Type'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Precip Type'].fillna(df['Precip Type'].mode()[0], inplace=True)


In [9]:
df.dtypes

Formatted Date               object
Summary                      object
Precip Type                  object
Temperature (C)             float64
Apparent Temperature (C)    float64
Humidity                    float64
Wind Speed (km/h)           float64
Wind Bearing (degrees)      float64
Visibility (km)             float64
Loud Cover                  float64
Pressure (millibars)        float64
Daily Summary                object
dtype: object

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
categorical_cols = df.select_dtypes(include=['object']).columns

In [12]:
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [13]:
df.dtypes

Formatted Date                int32
Summary                       int32
Precip Type                   int32
Temperature (C)             float64
Apparent Temperature (C)    float64
Humidity                    float64
Wind Speed (km/h)           float64
Wind Bearing (degrees)      float64
Visibility (km)             float64
Loud Cover                  float64
Pressure (millibars)        float64
Daily Summary                 int32
dtype: object

In [14]:
import numpy as np

In [16]:
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]

In [17]:
from sklearn.feature_selection import f_classif

In [18]:
# Reload the original target variable 'Summary'
original_data = pd.read_csv('weatherHistory.csv')
target = original_data['Summary'].loc[df.index]  # Align with current df after outlier removal

# Convert target to categorical numeric codes
y = target.astype('category').cat.codes

# Select only numerical features (including one-hot encoded ones)
X = df.select_dtypes(include=[np.number])

# Perform ANOVA F-test
f_values, p_values = f_classif(X, y)

# Create a DataFrame with results
anova_results = pd.DataFrame({
    'Feature': X.columns,
    'F-Value': f_values,
    'P-Value': p_values
}).sort_values(by='F-Value', ascending=False).reset_index(drop=True)

# ✅ Display all features
print(anova_results)


                     Feature      F-Value        P-Value
0                    Summary          inf   0.000000e+00
1              Daily Summary  2583.089803   0.000000e+00
2                   Humidity  1705.487466   0.000000e+00
3   Apparent Temperature (C)  1476.430827   0.000000e+00
4            Temperature (C)  1472.541800   0.000000e+00
5            Visibility (km)   721.098984   0.000000e+00
6       Pressure (millibars)   258.638072   0.000000e+00
7          Wind Speed (km/h)   200.371902  3.207460e-254
8             Formatted Date    51.530997   1.245139e-63
9     Wind Bearing (degrees)     8.527668   2.766083e-09
10               Precip Type          NaN            NaN
11                Loud Cover          NaN            NaN


  f = msb / msw
  f = msb / msw


In [19]:
# Keep only features with p-value ≤ 0.05
significant_features = anova_results[anova_results['P-Value'] <= 0.05]['Feature'].tolist()

# Filter the original DataFrame to keep only those features
df = df[significant_features]

# Now, df contains only statistically significant features
print(f"Selected {len(significant_features)} features with p-value ≤ 0.05.")

Selected 10 features with p-value ≤ 0.05.


In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
scaler = StandardScaler()

# Apply Standard scaling to the 'Temperature (C)' column
df['Temperature (C)_standardized'] = scaler.fit_transform(df[['Temperature (C)']])

# Print the DataFrame with the standardized temperature values
print("Data with Standardization (Z-score scaling):")
print(df)

Data with Standardization (Z-score scaling):
       Summary  Daily Summary  Humidity  Apparent Temperature (C)  \
0           19            197      0.89                  7.388889   
1           19            197      0.86                  7.227778   
2           17            197      0.89                  9.377778   
3           19            197      0.83                  5.944444   
4           17            197      0.83                  6.977778   
...        ...            ...       ...                       ...   
96448       19            170      0.43                 26.016667   
96449       19            170      0.48                 24.583333   
96450       19            170      0.56                 22.038889   
96451       19            170      0.60                 21.522222   
96452       19            170      0.61                 20.438889   

       Temperature (C)  Visibility (km)  Pressure (millibars)  \
0             9.472222          15.8263               1015.13

In [22]:
# Save to CSV
output_path = "preprocessed_weather_data.csv"
df.to_csv(output_path, index=False)