In [6]:
# 1. Data Loading and Exploration
#
# Uploads the dataset through the Colab file picker, parses it into `df`, and
# prints a structural overview: schema, first rows, feature/target split,
# dtype counts, binary features, and descriptive statistics.

# 1. Load the dataset
import io

from google.colab import files
import pandas as pd

uploaded = files.upload()  # Upload file manually; maps each file name to its bytes
if not uploaded:
    # Nothing was selected: stop here rather than carry on with a missing
    # (or stale, from an earlier run) `df`.
    raise RuntimeError("No file was uploaded. Re-run this cell and select the dataset CSV.")

# Prefer 'train.csv' when it is among the uploads; otherwise use the first uploaded file.
file_name = 'train.csv' if 'train.csv' in uploaded else next(iter(uploaded))

try:
    # Parse the bytes that were just uploaded instead of re-reading from disk:
    # when a file of the same name already exists, Colab saves the new upload
    # under a different name (e.g. 'train (1).csv'), so reading 'train.csv'
    # from disk could load an older copy.
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))
    print("File loaded successfully.")
except Exception as e:
    print(f"Error loading file: {e}")
    # Re-raise so the rest of the cell does not run without a freshly loaded `df`.
    raise

# 2. Explore initial structure
print("\n--- DataFrame Info ---")
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()

print("\n--- First 5 Rows ---")
print(df.head())

# 3. Summarize features and target variable
features = df.columns[:-1]  # All columns except the last
target = df.columns[-1]     # Last column is the target

print(f"\nNumber of features: {len(features)}")
print("Features:", features.tolist())
print("Target variable:", target)

# 4. Check data types
print("\n--- Data Types Summary ---")
print(df.dtypes.value_counts())

# Optional: List binary features (columns with exactly two distinct values).
# Only feature columns are scanned, so the target is never reported as a feature.
binary_features = [col for col in features if df[col].nunique() == 2]
print(f"\nBinary features ({len(binary_features)}): {binary_features}")

# 5. Basic descriptive statistics
print("\n--- Descriptive Statistics ---")
print(df.describe())


Saving train.csv to train (1).csv
File loaded successfully.

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time   

In [8]:
#  2. Data Cleaning and Preprocessing
# Audits `df` for missing values and non-numeric columns, and one-hot encodes
# any object-dtype columns that are found.

# --- Missing-value audit: per-column count of null entries ---
print("--- Missing Values in Each Column ---")
missing = df.isnull().sum()
if missing.sum() > 0:
    # Show only the columns that actually contain gaps.
    print(missing[missing > 0])
else:
    print("No missing values found.")

# If gaps are reported above, handle them here, for example by dropping the
# affected rows (dropna) or by filling them (fillna / forward fill).

# --- Dtype audit ---
print("\n--- Data Types Overview ---", df.dtypes, sep="\n")

# Every column in this dataset is already int or float, target `price_range`
# included. Object-dtype columns, if any existed, are the ones treated as
# categorical and one-hot encoded below.
categorical_cols = list(df.select_dtypes(include='object').columns)

if not categorical_cols:
    print("No categorical columns found. All features are already numerical.")
else:
    print("\n--- Transforming Categorical Columns ---")
    print("Categorical columns:", categorical_cols)
    # drop_first=True drops the first level of each encoded column.
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    print("One-hot encoding applied.")

# --- Final check ---
print("\n--- DataFrame Shape After Cleaning ---")
print(df.shape)

--- Missing Values in Each Column ---
No missing values found.

--- Data Types Overview ---
battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object
No categorical columns found. All features are already numerical.

--- DataFrame Shape After Cleaning ---
(2000, 21)
