In [1]:
# Data handling
import pandas as pd
import numpy as np

# Modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load train and test datasets
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

# Quick look
print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()


Train shape: (18154, 21)
Test shape: (4323, 20)


Unnamed: 0,house_id,sale_date,num_bedrooms,num_bathrooms,living_area,lot_area,num_floors,is_waterfront,view_rating,condition_index,...,above_area,basement_area,built_year,renovated_year,zip_area,latitude,longitude,neighbor_living_area,neighbor_lot_area,target_price
0,8902000050,20141027T000000,3,1.75,1720.0,7200.0,1.0,0,0.0,3,...,1420.0,300.0,1959,0.0,98125,47.7062,-122.304,1380,8000,622200.0
1,4325700085,20150325T000000,3,1.0,1310.0,8514.0,1.0,0,0.0,4,...,1310.0,0.0,1953,0.0,98133,47.7502,-122.353,1310,8514,417000.0
2,7732410420,20140617T000000,3,2.5,2590.0,7720.0,2.0,0,0.0,3,...,2590.0,0.0,1988,0.0,98007,47.659,-122.146,2600,9490,809000.0
3,4039701280,20150408T000000,3,2.25,2440.0,9689.0,1.0,0,2.0,4,...,1830.0,610.0,1974,0.0,98008,47.6141,-122.111,2730,9689,954500.0
4,5379805120,20150424T000000,2,1.0,740.0,7380.0,1.0,0,0.0,4,...,740.0,0.0,1951,0.0,98188,47.4481,-122.278,1500,10075,213000.0


In [3]:
train.isnull().sum()
test.isnull().sum()



Unnamed: 0,0
house_id,0
sale_date,0
num_bedrooms,0
num_bathrooms,0
living_area,0
lot_area,0
num_floors,0
is_waterfront,0
view_rating,0
condition_index,0


In [4]:
# Check for missing values
print(train.isnull().sum())

# Check data types
print(train.dtypes)

# Quick statistics
train.describe()


house_id                   0
sale_date                  0
num_bedrooms               0
num_bathrooms           1452
living_area                0
lot_area                   0
num_floors                 0
is_waterfront              0
view_rating             1452
condition_index            0
construction_grade         0
above_area                 0
basement_area           1452
built_year                 0
renovated_year          1452
zip_area                   0
latitude                   0
longitude                  0
neighbor_living_area       0
neighbor_lot_area          0
target_price               0
dtype: int64
house_id                  int64
sale_date                object
num_bedrooms              int64
num_bathrooms           float64
living_area             float64
lot_area                float64
num_floors              float64
is_waterfront             int64
view_rating             float64
condition_index          object
construction_grade       object
above_area              fl

Unnamed: 0,house_id,num_bedrooms,num_bathrooms,living_area,lot_area,num_floors,is_waterfront,view_rating,above_area,basement_area,built_year,latitude,longitude,neighbor_living_area,neighbor_lot_area,target_price
count,18154.0,18154.0,16702.0,18154.0,18154.0,18154.0,18154.0,16702.0,18154.0,16702.0,18154.0,18154.0,18154.0,18154.0,18154.0,18154.0
mean,4577761000.0,3.365374,2.109672,2293.149144,16902.24,1.499559,0.007051,0.228895,1957.000948,501.226713,1971.204142,47.560339,-122.21374,1983.376942,12772.240167,559286.5
std,2879825000.0,0.929909,0.766913,14839.652558,153186.4,0.542244,0.083675,0.754919,11546.936467,13757.364954,29.42795,0.138367,0.140696,681.194066,27515.681337,1520323.0
min,1000102.0,-1.0,-1.0,-100.0,-130872.6,1.0,0.0,0.0,290.0,0.0,1900.0,47.1593,-122.519,399.0,651.0,-50000.0
25%,2122052000.0,3.0,1.5,1428.943389,5000.0,1.0,0.0,0.0,1200.0,0.0,1952.0,47.4723,-122.328,1480.0,5082.25,320000.0
50%,3904945000.0,3.0,2.25,1910.0,7572.5,1.5,0.0,0.0,1569.094762,0.0,1975.0,47.5717,-122.23,1840.0,7620.0,450000.0
75%,7308900000.0,4.0,2.5,2540.0,10751.5,2.0,0.0,0.0,2200.0,550.0,1997.0,47.677875,-122.124,2360.0,10063.75,640000.0
max,9900000000.0,33.0,8.0,999999.0,9999999.0,3.5,1.0,4.0,777777.0,888888.0,2015.0,47.78279,-121.315,6210.0,871200.0,100000000.0


In [5]:
train['num_bathrooms'].fillna(train['num_bathrooms'].median(), inplace=True)
train['view_rating'].fillna(0, inplace=True)  # assuming 0 = no view
train['basement_area'].fillna(0, inplace=True)
train['renovated_year'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['num_bathrooms'].fillna(train['num_bathrooms'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['view_rating'].fillna(0, inplace=True)  # assuming 0 = no view
The behavior will change in pandas 3.0. This inplace method will never work because th

In [6]:
train['num_bathrooms'] = train['num_bathrooms'].fillna(train['num_bathrooms'].median())
train['view_rating'] = train['view_rating'].fillna(0)  # assuming 0 = no view
train['basement_area'] = train['basement_area'].fillna(0)
train['renovated_year'] = train['renovated_year'].fillna(0)


In [7]:
# One-hot encode categorical columns
train = pd.get_dummies(train, columns=['condition_index', 'zip_area'], drop_first=True)


In [8]:
from sklearn.preprocessing import StandardScaler

# Select numeric columns to scale (excluding target)
num_cols = ['num_bedrooms', 'num_bathrooms', 'living_area', 'lot_area',
            'num_floors', 'above_area', 'basement_area', 'built_year',
            'renovated_year', 'latitude', 'longitude',
            'neighbor_living_area', 'neighbor_lot_area']

scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])


ValueError: could not convert string to float: "'0.0'"

In [9]:
# Convert numeric columns to proper numeric type
for col in num_cols:
    train[col] = pd.to_numeric(train[col], errors='coerce')  # non-numeric becomes NaN

# Fill any NaNs that might appear after conversion
train[num_cols] = train[num_cols].fillna(0)

# Now scale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])


In [10]:
# Split data into features and target
X = train.drop(['house_id', 'sale_date', 'target_price'], axis=1)
y = train['target_price']

# Train-test split for evaluation
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Import and train a regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

# Evaluate using metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R2 Score:", r2)


ValueError: could not convert string to float: "'7'"

In [11]:
# Convert these object columns to numeric if they are numeric but stored as string
object_numeric_cols = ['condition_index', 'renovated_year']
for col in object_numeric_cols:
    train[col] = pd.to_numeric(train[col], errors='coerce')  # non-numeric becomes NaN
    train[col] = train[col].fillna(0)


KeyError: 'condition_index'

In [12]:
print(train.columns.tolist())


['house_id', 'sale_date', 'num_bedrooms', 'num_bathrooms', 'living_area', 'lot_area', 'num_floors', 'is_waterfront', 'view_rating', 'construction_grade', 'above_area', 'basement_area', 'built_year', 'renovated_year', 'latitude', 'longitude', 'neighbor_living_area', 'neighbor_lot_area', 'target_price', 'condition_index_ 4 ', 'condition_index_ 5 ', "condition_index_'3'", "condition_index_'4'", "condition_index_'5'", 'condition_index_1', 'condition_index_2', 'condition_index_2.0', 'condition_index_3', 'condition_index_3.0', 'condition_index_4', 'condition_index_4.0', 'condition_index_5', 'condition_index_5.0', 'zip_area_ 98003 ', 'zip_area_ 98005 ', 'zip_area_ 98006 ', 'zip_area_ 98007 ', 'zip_area_ 98008 ', 'zip_area_ 98010 ', 'zip_area_ 98014 ', 'zip_area_ 98022 ', 'zip_area_ 98027 ', 'zip_area_ 98028 ', 'zip_area_ 98029 ', 'zip_area_ 98030 ', 'zip_area_ 98031 ', 'zip_area_ 98032 ', 'zip_area_ 98033 ', 'zip_area_ 98034 ', 'zip_area_ 98038 ', 'zip_area_ 98039 ', 'zip_area_ 98040 ', 'zip_

In [13]:
# Drop non-feature columns
X = train.drop(columns=['house_id', 'sale_date', 'target_price'])
y = train['target_price']


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)


ValueError: could not convert string to float: "'7'"

In [16]:
X_train.dtypes[X_train.dtypes == 'object']


Unnamed: 0,0
construction_grade,object


In [17]:
for col in X_train.columns:
    if X_train[col].dtype == 'object':

        X_train[col] = X_train[col].str.replace("'", "").str.strip()
        X_train[col] = pd.to_numeric(X_train[col], errors='coerce')


In [18]:
X_train = X_train.fillna(0)


In [19]:
for col in X_val.columns:
    if X_val[col].dtype == 'object':
        X_val[col] = X_val[col].str.replace("'", "").str.strip()
        X_val[col] = pd.to_numeric(X_val[col], errors='coerce')

X_val = X_val.fillna(0)


In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Initialize the model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)


In [21]:
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R2 Score:", r2)


RMSE: 1662295.0939796832
MAE: 156197.0011385381
R2 Score: 0.024446834412778173


In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

X = train[num_cols]
y = train['target_price']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R2 Score:", r2)


RMSE: 1702396.5808445206
MAE: 121334.58818653105
R2 Score: -0.023189731433859384


In [23]:
# Cap extreme values to a reasonable range (e.g., 1–99 percentile)
for col in ['living_area','lot_area','basement_area','above_area']:
    upper = train[col].quantile(0.99)
    lower = train[col].quantile(0.01)
    train[col] = train[col].clip(lower, upper)


In [24]:
cat_cols = ['condition_index','zip_area']  # adjust based on your dataset
train = pd.get_dummies(train, columns=cat_cols, drop_first=True)


KeyError: "None of [Index(['condition_index', 'zip_area'], dtype='object')] are in the [columns]"

In [25]:
# Check your columns
print(train.columns.tolist())


['house_id', 'sale_date', 'num_bedrooms', 'num_bathrooms', 'living_area', 'lot_area', 'num_floors', 'is_waterfront', 'view_rating', 'construction_grade', 'above_area', 'basement_area', 'built_year', 'renovated_year', 'latitude', 'longitude', 'neighbor_living_area', 'neighbor_lot_area', 'target_price', 'condition_index_ 4 ', 'condition_index_ 5 ', "condition_index_'3'", "condition_index_'4'", "condition_index_'5'", 'condition_index_1', 'condition_index_2', 'condition_index_2.0', 'condition_index_3', 'condition_index_3.0', 'condition_index_4', 'condition_index_4.0', 'condition_index_5', 'condition_index_5.0', 'zip_area_ 98003 ', 'zip_area_ 98005 ', 'zip_area_ 98006 ', 'zip_area_ 98007 ', 'zip_area_ 98008 ', 'zip_area_ 98010 ', 'zip_area_ 98014 ', 'zip_area_ 98022 ', 'zip_area_ 98027 ', 'zip_area_ 98028 ', 'zip_area_ 98029 ', 'zip_area_ 98030 ', 'zip_area_ 98031 ', 'zip_area_ 98032 ', 'zip_area_ 98033 ', 'zip_area_ 98034 ', 'zip_area_ 98038 ', 'zip_area_ 98039 ', 'zip_area_ 98040 ', 'zip_

In [26]:
# Remove spaces and quotes from column names
train.columns = train.columns.str.strip()           # remove leading/trailing spaces
train.columns = train.columns.str.replace("'", "")  # remove quotes
train.columns = train.columns.str.replace(" ", "_") # replace spaces with underscores


In [27]:
print(train.columns.tolist())


['house_id', 'sale_date', 'num_bedrooms', 'num_bathrooms', 'living_area', 'lot_area', 'num_floors', 'is_waterfront', 'view_rating', 'construction_grade', 'above_area', 'basement_area', 'built_year', 'renovated_year', 'latitude', 'longitude', 'neighbor_living_area', 'neighbor_lot_area', 'target_price', 'condition_index__4', 'condition_index__5', 'condition_index_3', 'condition_index_4', 'condition_index_5', 'condition_index_1', 'condition_index_2', 'condition_index_2.0', 'condition_index_3', 'condition_index_3.0', 'condition_index_4', 'condition_index_4.0', 'condition_index_5', 'condition_index_5.0', 'zip_area__98003', 'zip_area__98005', 'zip_area__98006', 'zip_area__98007', 'zip_area__98008', 'zip_area__98010', 'zip_area__98014', 'zip_area__98022', 'zip_area__98027', 'zip_area__98028', 'zip_area__98029', 'zip_area__98030', 'zip_area__98031', 'zip_area__98032', 'zip_area__98033', 'zip_area__98034', 'zip_area__98038', 'zip_area__98039', 'zip_area__98040', 'zip_area__98042', 'zip_area__98

In [28]:
cat_cols = [col for col in train.columns if col.startswith('condition_index') or col.startswith('zip_area')]


In [29]:
# If you haven't already, drop the first of each category
train = pd.get_dummies(train, columns=cat_cols, drop_first=True)


In [30]:
X = train.drop(['house_id', 'sale_date', 'target_price'], axis=1)
y = train['target_price']


In [31]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R2 Score:", r2)


ValueError: could not convert string to float: "'7'"

In [33]:
num_cols = [
    'num_bedrooms','num_bathrooms','living_area','lot_area','num_floors',
    'is_waterfront','view_rating','construction_grade','above_area','basement_area',
    'built_year','renovated_year','latitude','longitude','neighbor_living_area','neighbor_lot_area'
]

for col in num_cols:
    train[col] = pd.to_numeric(train[col], errors='coerce')  # convert strings to numbers
train[num_cols] = train[num_cols].fillna(0)  # replace NaNs with 0


In [34]:
X = train.drop(['house_id', 'sale_date', 'target_price'], axis=1)
y = train['target_price']


In [35]:
cat_cols = X.select_dtypes(include='object').columns.tolist()
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R2 Score:", r2)


RMSE: 1660943.1936138533
MAE: 153913.0682063157
R2 Score: 0.026032972028339096


In [47]:
final_model = LinearRegression()
final_model.fit(X, y)  # Use all available data


In [48]:
y_test_pred = final_model.predict(X_test)  # X_test = your test features


NameError: name 'X_test' is not defined

In [49]:
# Target variable
y = train['target_price']

# Features (drop target and any non-numeric IDs)
X = train.drop(['target_price', 'house_id', 'sale_date'], axis=1)


In [50]:
cat_cols = ['condition_index', 'zip_area']  # adjust if needed

# One-hot encoding
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)


KeyError: "None of [Index(['condition_index', 'zip_area'], dtype='object')] are in the [columns]"

In [51]:
# Drop non-feature columns
X = train.drop(['house_id', 'sale_date', 'target_price'], axis=1)
y = train['target_price']

X_test = test.drop(['house_id', 'sale_date'], axis=1)


In [52]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae = mean_absolute_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R2 Score:", r2)


RMSE: 1660943.1936138533
MAE: 153913.0682063157
R2 Score: 0.026032972028339096


In [53]:
y_test_pred = model.predict(X_test)

submission = pd.DataFrame({
    'house_id': test['house_id'],
    'predicted_price': y_test_pred
})

submission.to_csv('submission.csv', index=False)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- condition_index
- zip_area
Feature names seen at fit time, yet now missing:
- condition_index_1_True
- condition_index_2.0_True
- condition_index_2_True
- condition_index_3.0_True
- condition_index_3_True
- ...


In [54]:
cat_cols = ['condition_index', 'zip_area']

# Apply one-hot encoding to both train and test
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)

# Align test set columns to train set
X_test = X_test.reindex(columns=X.columns, fill_value=0)


KeyError: "None of [Index(['condition_index', 'zip_area'], dtype='object')] are in the [columns]"

In [55]:
# Show columns containing 'condition' or 'zip'
[c for c in X.columns if 'condition' in c or 'zip' in c]


['condition_index__4_True',
 'condition_index__5_True',
 'condition_index_3_True',
 'condition_index_3_True',
 'condition_index_4_True',
 'condition_index_4_True',
 'condition_index_5_True',
 'condition_index_5_True',
 'condition_index_1_True',
 'condition_index_2_True',
 'condition_index_2.0_True',
 'condition_index_3_True',
 'condition_index_3_True',
 'condition_index_3.0_True',
 'condition_index_4_True',
 'condition_index_4_True',
 'condition_index_4.0_True',
 'condition_index_5_True',
 'condition_index_5_True',
 'condition_index_5.0_True',
 'zip_area__98003_True',
 'zip_area__98005_True',
 'zip_area__98006_True',
 'zip_area__98007_True',
 'zip_area__98008_True',
 'zip_area__98010_True',
 'zip_area__98014_True',
 'zip_area__98022_True',
 'zip_area__98027_True',
 'zip_area__98028_True',
 'zip_area__98029_True',
 'zip_area__98030_True',
 'zip_area__98031_True',
 'zip_area__98032_True',
 'zip_area__98033_True',
 'zip_area__98034_True',
 'zip_area__98038_True',
 'zip_area__98039_True',


In [56]:
# Align test set columns to train set
X_test = X_test.reindex(columns=X.columns, fill_value=0)


In [57]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)


In [58]:
y_test_pred = model.predict(X_test)


In [61]:
# Ensure test set columns match training columns
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Predict using your trained model
y_test_pred = model.predict(X_test)

# Prepare submission file
submission = pd.DataFrame({
    'house_id': test['house_id'],        # from test dataset
    'predicted_price': y_test_pred       # your predictions
})

# Save with the competition's required naming format
submission.to_csv('EM12_Alpha_Minds_Task2_Predictions.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!


In [63]:
from google.colab import files
files.download('EM12_Alpha_Minds_Task2_Predictions.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>