In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import joblib
import os

# Define the uploads directory
uploads_dir = 'uploads'

# Find the first CSV file in the uploads directory
uploaded_file_name = None
for file in os.listdir(uploads_dir):
    if file.endswith('.csv'):
        uploaded_file_name = file
        break  # Stop after finding the first CSV file

if uploaded_file_name is None:
    print("No CSV file found in the uploads directory.")
else:
    # Load the uploaded CSV file
    uploaded_file_path = os.path.join(uploads_dir, uploaded_file_name)
    try:
        new_data_df = pd.read_csv(uploaded_file_path)
    except Exception as e:
        print(f"Error reading {uploaded_file_path}: {e}")
        new_data_df = pd.DataFrame()  # Create an empty DataFrame if there is an error

    # Load the existing Raw_Data.csv file
    raw_data_path = 'Raw_Data.csv'
    if os.path.exists(raw_data_path):
        raw_data_df = pd.read_csv(raw_data_path)
    else:
        print("Raw_Data.csv not found. Creating a new one.")
        raw_data_df = pd.DataFrame()  # Create an empty DataFrame if Raw_Data.csv doesn't exist

    # Merge the two DataFrames
    combined_df = pd.concat([raw_data_df, new_data_df], ignore_index=True)

    # Save the combined DataFrame as Raw_Data.csv (replacing it)
    combined_df.to_csv(raw_data_path, index=False)

    # Fill missing values with column mean
    for col in combined_df.select_dtypes(include=[np.number]).columns:
        mean_value = combined_df[col].mean()
        combined_df[col].fillna(mean_value, inplace=True)

    # Drop duplicate records
    combined_df = combined_df.drop_duplicates(keep='first')

    # Outlier Detection and Treatment using IQR
    def remove_outliers(df, column):
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])

    # Removing outliers for all numerical columns
    for col in combined_df.select_dtypes(include=[np.number]).columns:
        remove_outliers(combined_df, col)

    # Save the processed data for visualization and model building
    combined_df.to_csv('Cleaned_Data.csv', index=False)

    # Splitting Data into Features and Target
    X = combined_df.drop('price_range', axis=1)  # Ensure 'price_range' is the target variable
    y = combined_df['price_range']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Evaluate models
    models = {
        "Random Forest": RandomForestClassifier(),
        "Bagging": BaggingClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "SVC": SVC()
    }

    best_model = None
    best_score = 0

    for name, model in models.items():
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        print(f"{name} Test Score: {score:.4f}")
        if score > best_score:
            best_score = score
            best_model = model

    # Export the best model
    joblib.dump(best_model, 'model.pkl')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df[col].fillna(mean_value, inplace=True)


Random Forest Test Score: 0.8731
Bagging Test Score: 0.8682


Gradient Boosting Test Score: 0.8806
SVC Test Score: 0.9453


In [2]:
# # Generate random test data
# new_data = {
#     'battery_power': [850],
#     'blue': [1],
#     'clock_speed': [2.0],
#     'dual_sim': [1],
#     'fc': [2.0],
#     'four_g': [1],
#     'int_memory': [16],
#     'm_dep': [0.6],
#     'mobile_wt': [150],
#     'n_cores': [8],
#     'pc': [2],
#     'px_height': [720],
#     'px_width': [1280],
#     'ram': [2048],
#     'sc_h': [6.0],
#     'sc_w': [3.0],
#     'talk_time': [20],
#     'three_g': [1],
#     'touch_screen': [1],
#     'wifi': [1]
# }

# new_df = pd.DataFrame(new_data)

# model = joblib.load('model.pkl')
# price_range_prediction = model.predict(new_df).astype(int)

# print("Predicted Price Range:", price_range_prediction[0])