# Data Analysis Notebook

This notebook contains analysis for the cal_consumption dataset.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: seaborn white-grid theme and a 12x8 default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Read both CSV splits from the current working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Report the (rows, columns) shape of each split
for label, frame in (("Training data shape:", train_df), ("Test data shape:", test_df)):
    print(label, frame.shape)

In [None]:
# Preview the first five training rows (rendered as the cell's output)
train_df.head(n=5)

In [None]:
# Summary statistics for the training data, with the quartiles spelled out explicitly
train_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Count nulls per column in each split, then print each report under its heading
train_null_counts = train_df.isna().sum()
test_null_counts = test_df.isna().sum()

print("Missing values in training data:", train_null_counts, sep="\n")
print("\nMissing values in test data:", test_null_counts, sep="\n")

## Exploratory Data Analysis

In [None]:
# Explore the distribution of numerical features.
# Only int64/float64 columns are selected; adjust the dtype list if your
# dataset stores numbers in other dtypes.
num_features = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

if num_features:
    # One histogram (with KDE overlay) per numeric column, stacked vertically.
    # squeeze=False makes plt.subplots always return a 2-D array of Axes; with
    # the default squeeze=True a single-row grid yields a bare Axes object and
    # indexing it (axes[i]) raises a TypeError.
    fig, axes_grid = plt.subplots(
        len(num_features),
        1,
        figsize=(12, 4 * len(num_features)),
        squeeze=False,
    )
    # Flatten to 1-D so `axes[i]` is the i-th subplot
    axes = axes_grid.ravel()
    for ax, feature in zip(axes, num_features):
        sns.histplot(train_df[feature], kde=True, ax=ax)
        ax.set_title(f'Distribution of {feature}')
    plt.tight_layout()
    plt.show()
else:
    # plt.subplots(0, 1) raises a ValueError, so skip plotting entirely
    print("No numeric columns available for distribution plots.")

In [ ]:
# Correlation analysis: pairwise correlations between the int64/float64 columns
corr_matrix = train_df.select_dtypes(include=['int64', 'float64']).corr()
n_numeric = corr_matrix.shape[0]

if n_numeric == 0:
    # Nothing to correlate
    print("No numeric columns available for correlation analysis.")
else:
    plt.figure(figsize=(14, 10))
    # Boolean mask covering the upper triangle, diagonal included
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    heatmap_kwargs = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    # For a 1x1 matrix the mask would hide the only cell, so apply it only
    # when there are at least two numeric columns
    if n_numeric > 1:
        heatmap_kwargs['mask'] = mask
    heatmap = sns.heatmap(corr_matrix, **heatmap_kwargs)

    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

## Feature Engineering

In [None]:
# Add your feature engineering code here
# Example:
# train_df['new_feature'] = train_df['feature1'] / train_df['feature2']


## Model Building

In [None]:
# Import ML libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Prepare data for modeling
# Replace 'target_column' below with the name of the target variable in your dataset,
# then uncomment the lines to build the train/validation split

# X = train_df.drop('target_column', axis=1)
# y = train_df['target_column']

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Feature scaling
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)

In [None]:
# Train a linear regression model
# lr_model = LinearRegression()
# lr_model.fit(X_train_scaled, y_train)

# lr_pred = lr_model.predict(X_val_scaled)
# print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_val, lr_pred)))
# print("Linear Regression R²:", r2_score(y_val, lr_pred))

In [None]:
# Train a random forest model
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# rf_pred = rf_model.predict(X_val)
# print("Random Forest RMSE:", np.sqrt(mean_squared_error(y_val, rf_pred)))
# print("Random Forest R²:", r2_score(y_val, rf_pred))

## Model Evaluation

In [None]:
# Feature importance (for Random Forest)
# feature_importance = pd.DataFrame()
# feature_importance['Feature'] = X.columns
# feature_importance['Importance'] = rf_model.feature_importances_
# feature_importance.sort_values(by='Importance', ascending=False, inplace=True)

# plt.figure(figsize=(12, 8))
# sns.barplot(x='Importance', y='Feature', data=feature_importance)
# plt.title('Feature Importance')
# plt.show()

## Predictions on Test Data

In [None]:
# Make predictions on test data
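# X_test = test_df.drop('id', axis=1)  # Replace 'id' with any non-feature column
# X_test_scaled = scaler.transform(X_test)  # Only needed for models fit on scaled features (e.g. lr_model)

# The random forest above was fit on unscaled features, so it predicts on X_test directly
# test_predictions = rf_model.predict(X_test)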
# submission = pd.DataFrame({
#     'id': test_df['id'],  # Replace with actual ID column name
#     'prediction': test_predictions
# })

# submission.to_csv('submission.csv', index=False)
# submission.head()