In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the raw CSV and keep only the rows that have a target value:
# a row without fare_amount cannot be used for supervised training.
file_path = r'C:/Users/mohsa/Downloads/ai/advanced surveillance/intern/weak7/final_internship_data.csv'
data = pd.read_csv(file_path).dropna(subset=['fare_amount'])

# Replace each categorical column with integer codes. The fitted encoder for
# every column is kept in `label_encoders` (keyed by column name) so the same
# category -> code mapping can be reused later.
categorical_cols = ['Weather', 'CarCondition', 'Traffic Condition']
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Model inputs: the encoded categorical columns followed by the numeric
# distance/bearing columns. The target is the fare amount.
feature_cols = categorical_cols + ['distance', 'jfk_dist', 'ewr_dist', 'lga_dist', 'nyc_dist', 'bearing']
X = data[feature_cols]
y = data['fare_amount']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
# Keep only the rows in which every feature is present, then select the
# targets by the surviving index labels so X and y stay row-aligned.
train_complete = X_train.notna().all(axis=1)
X_train = X_train.loc[train_complete]
y_train = y_train.loc[X_train.index]

test_complete = X_test.notna().all(axis=1)
X_test = X_test.loc[test_complete]
y_test = y_test.loc[X_test.index]

# The training rows changed, so re-fit the scaler on the filtered training
# split and re-apply it to both splits.
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
from sklearn.impute import SimpleImputer

# Fill missing feature values with the per-column mean learned from the
# training split (SimpleImputer also accepts 'median', 'most_frequent', etc.).
# NOTE(review): if the In [10] cell has already dropped every row containing
# NaN from X_train / X_test, there is nothing left to fill here and the
# imputed arrays hold the same values as the inputs -- confirm which of the
# two missing-value strategies is actually intended.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Re-fit the scaler on the imputed training data and apply it to both splits.
scaler.fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a 100-tree random forest regressor on the scaled training features.
# random_state is fixed so repeated runs build the same forest.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Score the held-out split: MAE is the average absolute difference between
# the actual and the predicted fare_amount.
y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 2.2210299701770326


In [13]:
import joblib

# Persist the trained model together with EVERY fitted preprocessing object,
# so the same chain (encode -> impute -> scale -> predict) can be rebuilt
# when the model is loaded elsewhere. Files are written to the current
# working directory.
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
# The scaler was last fitted on imputer output, so the imputer belongs to the
# pipeline and has to be saved alongside it.
joblib.dump(imputer, 'imputer.pkl')
# Kept as the last expression so the cell's displayed value is unchanged.
joblib.dump(label_encoders, 'label_encoders.pkl')


['label_encoders.pkl']