In [6]:
import pandas as pd
df = pd.read_csv("sales.csv")
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640840 entries, 0 to 640839
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Unnamed: 0           640840 non-null  int64 
 1   store_ID             640840 non-null  int64 
 2   day_of_week          640840 non-null  int64 
 3   date                 640840 non-null  object
 4   nb_customers_on_day  640840 non-null  int64 
 5   open                 640840 non-null  int64 
 6   promotion            640840 non-null  int64 
 7   state_holiday        640840 non-null  object
 8   school_holiday       640840 non-null  int64 
 9   sales                640840 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 48.9+ MB


Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,nb_customers_on_day,open,promotion,school_holiday,sales
count,640840.0,640840.0,640840.0,640840.0,640840.0,640840.0,640840.0,640840.0
mean,355990.675084,558.211348,4.000189,633.398577,0.830185,0.381718,0.178472,5777.469011
std,205536.290268,321.878521,1.996478,464.094416,0.37547,0.485808,0.38291,3851.338083
min,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,178075.75,280.0,2.0,405.0,1.0,0.0,0.0,3731.0
50%,355948.5,558.0,4.0,609.0,1.0,0.0,0.0,5746.0
75%,533959.25,837.0,6.0,838.0,1.0,1.0,0.0,7860.0
max,712044.0,1115.0,7.0,5458.0,1.0,1.0,1.0,41551.0


In [7]:
df['date'] = pd.to_datetime(df['date'])

# Example feature engineering:
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['is_weekend'] = df['day_of_week'].isin([6, 7]).astype(int)

#df.describe()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640840 entries, 0 to 640839
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Unnamed: 0           640840 non-null  int64         
 1   store_ID             640840 non-null  int64         
 2   day_of_week          640840 non-null  int64         
 3   date                 640840 non-null  datetime64[ns]
 4   nb_customers_on_day  640840 non-null  int64         
 5   open                 640840 non-null  int64         
 6   promotion            640840 non-null  int64         
 7   state_holiday        640840 non-null  object        
 8   school_holiday       640840 non-null  int64         
 9   sales                640840 non-null  int64         
 10  month                640840 non-null  int32         
 11  year                 640840 non-null  int32         
 12  is_weekend           640840 non-null  int64         
dtypes: datetime64[

In [8]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['sales'])
y = df['sales']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:

# Drop 'date' from both sets because it's not directly useful as a raw feature in most machine learning models. 
X_train_prep = X_train.drop('date', axis=1)
X_val_prep = X_val.drop('date', axis=1)

# Sample 50,000 rows from the training set
X_sample = X_train_prep.sample(n=50000, random_state=42)
y_sample = y_train.loc[X_sample.index]


In [12]:
# One-hot encode the state_holiday column
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Create and fit encoder
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(X_sample[['state_holiday']])

# Transform training data
state_holiday_encoded = encoder.transform(X_sample[['state_holiday']])
state_holiday_cols = encoder.get_feature_names_out(['state_holiday'])
state_holiday_df = pd.DataFrame(state_holiday_encoded, columns=state_holiday_cols, index=X_sample.index)

# Drop original column and add encoded columns
X_sample_encoded = X_sample.drop('state_holiday', axis=1).join(state_holiday_df)

# Similarly transform validation data
val_state_holiday_encoded = encoder.transform(X_val_prep[['state_holiday']])
val_state_holiday_df = pd.DataFrame(
    val_state_holiday_encoded, 
    columns=state_holiday_cols, 
    index=X_val_prep.index
)
X_val_encoded = X_val_prep.drop('state_holiday', axis=1).join(val_state_holiday_df)

# Train model on the encoded sample
model = RandomForestRegressor(n_estimators=100, random_state=42)


In [13]:

# Train the model with sample data
model.fit(X_sample_encoded, y_sample)

# Predict using the encoded validation data
from sklearn.metrics import r2_score
y_pred = model.predict(X_val_encoded)
r2 = r2_score(y_val, y_pred)
print("Validation R² Score (sample-trained model):", r2)


feature_importance = pd.DataFrame({
    'Feature': X_sample_encoded.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nTop 10 most important features:")
print(feature_importance.head(10))

Validation R² Score (sample-trained model): 0.9000239256974296

Top 10 most important features:
                Feature  Importance
3   nb_customers_on_day    0.869005
1              store_ID    0.046771
0            Unnamed: 0    0.025064
5             promotion    0.022040
7                 month    0.015496
2           day_of_week    0.013198
8                  year    0.005170
6        school_holiday    0.002340
9            is_weekend    0.000776
10      state_holiday_0    0.000078


In [6]:
with open("r2-RF.txt", "w") as f:
    f.write(str(r2))
