In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [38]:
print("Loading data and model...")
train_df = pd.read_csv("Train_data.csv")
test_df = pd.read_csv("Test_data.csv")
model = joblib.load("random_forest_rain_prediction_model.pkl")

Loading data and model...


In [39]:
test_ids = test_df['Unnamed: 0'] if 'Unnamed: 0' in test_df.columns else test_df.index

In [40]:
if "Unnamed: 0" in train_df.columns:
    train_df.drop(["Unnamed: 0"], axis=1, inplace=True)
if "Unnamed: 0" in test_df.columns:
    test_df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [41]:
X_train_raw = train_df.drop(columns=["RainTomorrow"])
X_test_raw = test_df.copy()

In [42]:
categorical_features = X_train_raw.select_dtypes(include=["object"]).columns.tolist()
numerical_features = X_train_raw.select_dtypes(include=["number"]).columns.tolist()

In [43]:
def preprocess_data(df, is_train=True):
    df = df.copy()
    
    # Impute Categorical (Mode)
    for col in categorical_features:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df.groupby("Location")[col].transform(lambda s: s.mode()[0] if not s.mode().empty else None))
    
    for col in categorical_features:
        if df[col].isnull().any():
            df[col].fillna(df[col].mode()[0], inplace=True)
    
            
    # Impute Numerical (Mean)
    for col in numerical_features:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df.groupby("Location")[col].transform(lambda s: s.mean() if not s.mean() else None))

    for col in numerical_features:
        if df[col].isnull().any():
            df[col].fillna(df[col].mean(), inplace=True)    

    # Map RainToday
    if "RainToday" in df.columns:
        df['RainToday'] = df['RainToday'].map({'Yes': 1, 'No': 0}).fillna(0)
        
    return df

In [44]:
X_train_clean = preprocess_data(X_train_raw)
X_test_clean = preprocess_data(X_test_raw)

In [45]:
train_objs_num = len(X_train_clean)
dataset = pd.concat(objs=[X_train_clean, X_test_clean], axis=0)
dataset = pd.get_dummies(dataset, columns=categorical_features, drop_first=True)

In [46]:
X_train_encoded = dataset[:train_objs_num]
X_test_encoded = dataset[train_objs_num:]

In [47]:
scaler = StandardScaler()
X_train_encoded[numerical_features] = scaler.fit_transform(X_train_encoded[numerical_features])
X_test_encoded[numerical_features] = scaler.transform(X_test_encoded[numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_encoded[numerical_features] = scaler.fit_transform(X_train_encoded[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_encoded[numerical_features] = scaler.transform(X_test_encoded[numerical_features])


In [48]:
pca = PCA(n_components=34) # The model expects exactly 34 components
X_train_pca = pca.fit_transform(X_train_encoded)
X_test_pca = pca.transform(X_test_encoded)

In [49]:
X_test_final = pd.DataFrame(X_test_pca, index=X_test_encoded.index)

In [50]:
predictions = model.predict(X_test_final)

In [51]:
output = pd.DataFrame({'id': test_ids, 'RainTomorrow': predictions})
output = output.sort_values(by='id').reset_index(drop=True).drop(["id"], axis=1)
output.to_csv("submission.csv", index=False)