In [2]:
import pandas as pd

In [3]:
df=pd.read_csv('weatherHistory.csv')

In [4]:
df.dtypes

Formatted Date               object
Summary                      object
Precip Type                  object
Temperature (C)             float64
Apparent Temperature (C)    float64
Humidity                    float64
Wind Speed (km/h)           float64
Wind Bearing (degrees)      float64
Visibility (km)             float64
Loud Cover                  float64
Pressure (millibars)        float64
Daily Summary                object
dtype: object

In [5]:
df['Summary'].value_counts()

Summary
Partly Cloudy                          31733
Mostly Cloudy                          28094
Overcast                               16597
Clear                                  10890
Foggy                                   7148
Breezy and Overcast                      528
Breezy and Mostly Cloudy                 516
Breezy and Partly Cloudy                 386
Dry and Partly Cloudy                     86
Windy and Partly Cloudy                   67
Light Rain                                63
Breezy                                    54
Windy and Overcast                        45
Humid and Mostly Cloudy                   40
Drizzle                                   39
Breezy and Foggy                          35
Windy and Mostly Cloudy                   35
Dry                                       34
Humid and Partly Cloudy                   17
Dry and Mostly Cloudy                     14
Rain                                      10
Windy                                      8
Hu

In [6]:
df = df.drop('Formatted Date', axis=1)

In [7]:
df.columns

Index(['Summary', 'Precip Type', 'Temperature (C)', 'Apparent Temperature (C)',
       'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)',
       'Visibility (km)', 'Loud Cover', 'Pressure (millibars)',
       'Daily Summary'],
      dtype='object')

In [8]:
df.isnull().sum() #checking for null values

Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64

In [9]:
df['Daily Summary'].value_counts()

Daily Summary
Mostly cloudy throughout the day.                                                                       20085
Partly cloudy throughout the day.                                                                        9981
Partly cloudy until night.                                                                               6169
Partly cloudy starting in the morning.                                                                   5184
Foggy in the morning.                                                                                    4201
                                                                                                        ...  
Breezy starting overnight continuing until morning and foggy overnight.                                    24
Mostly cloudy throughout the day and breezy starting overnight continuing until afternoon.                 24
Partly cloudy starting in the morning and breezy starting in the afternoon continuing until evening.      

In [10]:
#imputing precip type(categorical variable with mode)
df['Precip Type'].fillna(df['Precip Type'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Precip Type'].fillna(df['Precip Type'].mode()[0], inplace=True)


In [11]:
#one hot encoding on categorical variables (summary, precip type, daily summary)
df = pd.get_dummies(df, columns=['Summary', 'Precip Type', 'Daily Summary'], drop_first=True)

In [62]:
df.dtypes

Temperature (C)                             float64
Apparent Temperature (C)                    float64
Humidity                                    float64
Wind Speed (km/h)                           float64
Wind Bearing (degrees)                      float64
                                             ...   
Daily Summary_Partly cloudy until night.       bool
Daily Summary_Rain throughout the day.         bool
Daily Summary_Rain until afternoon.            bool
Daily Summary_Rain until morning.              bool
Daily Summary_Windy in the afternoon.          bool
Length: 248, dtype: object

In [12]:
import numpy as np

In [65]:
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]

In [13]:
from sklearn.feature_selection import f_classif

In [14]:
# Reload the original target variable 'Summary'
original_data = pd.read_csv('weatherHistory.csv')
target = original_data['Summary'].loc[df.index]  # Align with current df after outlier removal

# Convert target to categorical numeric codes
y = target.astype('category').cat.codes

# Select only numerical features (including one-hot encoded ones)
X = df.select_dtypes(include=[np.number])

# Perform ANOVA F-test
f_values, p_values = f_classif(X, y)

# Create a DataFrame with results
anova_results = pd.DataFrame({
    'Feature': X.columns,
    'F-Value': f_values,
    'P-Value': p_values
}).sort_values(by='F-Value', ascending=False).reset_index(drop=True)

# ✅ Display all features
print(anova_results)


                    Feature      F-Value        P-Value
0           Visibility (km)  2460.876815   0.000000e+00
1         Wind Speed (km/h)  1257.932356   0.000000e+00
2                  Humidity  1042.587182   0.000000e+00
3           Temperature (C)   894.112610   0.000000e+00
4  Apparent Temperature (C)   852.577028   0.000000e+00
5      Pressure (millibars)   147.834422   0.000000e+00
6    Wind Bearing (degrees)    30.951449  3.343886e-152
7                Loud Cover          NaN            NaN


  f = msb / msw


In [15]:
# Keep only features with p-value ≤ 0.05
significant_features = anova_results[anova_results['P-Value'] <= 0.05]['Feature'].tolist()

# Filter the original DataFrame to keep only those features
df = df[significant_features]

# Now, df contains only statistically significant features
print(f"Selected {len(significant_features)} features with p-value ≤ 0.05.")


Selected 7 features with p-value ≤ 0.05.


In [20]:
from collections import Counter

# Count class instances
class_counts = Counter(y)
# Keep only classes with at least 6 samples
valid_classes = [cls for cls, count in class_counts.items() if count >= 6]

# Filter X and y
valid_indices = [i for i, label in enumerate(y) if label in valid_classes]
X_filtered = X.iloc[valid_indices]
y_filtered = y[valid_indices]

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_filtered, y_filtered)

# Decode labels if needed
y_resampled_labels = le.inverse_transform(y_resampled)

print("Before SMOTE:", Counter(y_filtered))
print("After SMOTE:", Counter(y_resampled))

Before SMOTE: Counter({19: 31733, 17: 28094, 18: 16597, 6: 10890, 12: 7148, 4: 528, 3: 516, 5: 386, 11: 86, 26: 67, 16: 63, 0: 54, 25: 45, 13: 40, 8: 39, 2: 35, 24: 35, 9: 34, 15: 17, 10: 14, 20: 10, 21: 8, 14: 7})
After SMOTE: Counter({19: 31733, 17: 31733, 18: 31733, 12: 31733, 3: 31733, 6: 31733, 5: 31733, 4: 31733, 13: 31733, 15: 31733, 25: 31733, 2: 31733, 26: 31733, 0: 31733, 11: 31733, 24: 31733, 9: 31733, 21: 31733, 14: 31733, 16: 31733, 8: 31733, 10: 31733, 20: 31733})


In [None]:
import pandas as pd

# Assuming X_resampled and y_resampled_labels are already defined from SMOTE step
# Combine the features and target back into a single DataFrame
df_resampled = X_resampled.copy()
df_resampled['Summary'] = y_resampled_labels  # Add the target column back

# Save to CSV
output_path = "preprocessed_weather_data.csv"
df_resampled.to_csv(output_path, index=False)

'preprocessed_weather_data.csv'