In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

home_data = pd.read_csv("home-data-for-ml-course/train.csv")

y = home_data.SalePrice

features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

X = home_data[features]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

rf_model = RandomForestRegressor(random_state=1, n_estimators=500)
rf_model.fit(train_X, train_y)
rf_val_prediction = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_prediction, val_y)
rf_val_rmse = np.sqrt(mean_squared_error(rf_val_prediction, val_y))

print("validation mae for random forest model: {:,.0f}".format(rf_val_mae))
print("validation rmse for random forest model: {:,.0f}".format(rf_val_rmse))


validation mae for random forest model: 21,887
validation rmse for random forest model: 31,387


In [2]:
rf_model_on_full_data = RandomForestRegressor(random_state=1)
rf_model_on_full_data.fit(X, y)

test_data = pd.read_csv("home-data-for-ml-course/test.csv")

test_X = test_data[features]
test_pred = rf_model_on_full_data.predict(test_X)


In [3]:
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_pred})
output

Unnamed: 0,Id,SalePrice
0,1461,122656.58
1,1462,156789.00
2,1463,182959.00
3,1464,178102.00
4,1465,189049.48
...,...,...
1454,2915,83645.00
1455,2916,86785.00
1456,2917,151283.01
1457,2918,127878.00


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("melb_data.csv")

y = data.Price

In [5]:
melb_predictors = data.drop(['Price'], axis=1)

In [9]:
X = melb_predictors.select_dtypes(exclude=['object'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [15]:
cols_with_missing_values = [col for col in X_train.columns if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing_values, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing_values, axis=1)

print("MAE from approach 1 (drop cols with missing values): ")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train,  y_valid))


MAE from approach 1 (drop cols with missing values): 
183550.22137772635


In [17]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()

my_imputer.fit(X_train)
imputed_X_train = pd.DataFrame(my_imputer.transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print("MAE from approach 2 (imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))


MAE from approach 2 (imputation):
178166.46269899711


In [18]:
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

for col in cols_with_missing_values:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_train_plus))

imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print("MAE from approach 3 ( extension to imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

    



MAE from approach 3 ( extension to imputation):
178166.46269899711


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("melb_data.csv")
y = data.Price
X = data.drop(['Price'], axis=1)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]

X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]

numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = low_cardinality_cols + numerical_cols
X_train =  X_train_full[my_cols].copy()
X_valid =  X_valid_full[my_cols].copy()

In [23]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
1041,h,S,Southern Metropolitan,3,11.2,3186.0,3.0,1.0,366.0,-37.9038,145.0001,10579.0
1989,h,S,Northern Metropolitan,3,7.8,3058.0,3.0,1.0,238.0,-37.7539,144.9612,11204.0
10157,h,S,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,439.0,-37.77047,144.97005,11918.0
1711,u,S,Southern Metropolitan,2,11.4,3163.0,2.0,1.0,0.0,-37.8863,145.066,7822.0
11565,h,S,Western Metropolitan,4,11.0,3018.0,4.0,2.0,615.0,-37.87057,144.83623,5301.0


In [25]:
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)


Categorical variables:
['Type', 'Method', 'Regionname']


In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [29]:
from sklearn.preprocessing import OrdinalEncoder

label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

oridinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = oridinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = oridinal_encoder.transform(X_valid[object_cols])

print("MAE from approach 2 (ordinal encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from approach 2 (ordinal encoding):
164359.03402763166


In [30]:
from sklearn.preprocessing import LabelEncoder

label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

for c in object_cols:
    label_encoder = LabelEncoder()
    label_X_train[c] = label_encoder.fit_transform(X_train[c])
    label_X_valid[c] = label_encoder.transform(X_valid[c])

print("MAE from approach 2 (label encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from approach 2 (label encoding):
164359.03402763166


In [39]:
from sklearn.preprocessing import OneHotEncoder

# Convert all column names to strings
X_train.columns = X_train.columns.astype(str)
X_valid.columns = X_valid.columns.astype(str)

# OneHot Encoding
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# Restore the indices
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove original categorical columns
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Concatenate numeric and encoded categorical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Evaluate the model
print("MAE from approach 3 (one-hot encoding):")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))



MAE from approach 3 (one-hot encoding):


TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [51]:
# NEW EXCERCISE

In [69]:
import pandas as pd 
from sklearn.model_selection import train_test_split

X = pd.read_csv("home-data-for-ml-course/train.csv", index_col='Id')
X_test = pd.read_csv("home-data-for-ml-course/test.csv", index_col='Id')

X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(["SalePrice"], axis=1, inplace=True)

cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)



In [71]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [73]:
cat_col = [ c for c in X_train.columns if X_train[c].dtype == "object"]
drop_X_train = X_train.drop(cat_col, axis=1)
drop_X_valid = X_valid.drop(cat_col, axis=1)

In [75]:
print("MAE (drop categorical variable):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE (drop categorical variable):
17837.82570776256


In [77]:
object_cols = [col for col in X_train.columns if X_train[col].dtype =='object']

good_label_cols = [col for col in object_cols if set(X_valid[col]).issubset(set(X_train[col]))]

bad_label_col = list(set(object_cols) - set(good_label_cols))

In [81]:
from sklearn.preprocessing import OrdinalEncoder

# Select good label columns
label_X_train = X_train.drop(bad_label_col, axis=1)
label_X_valid = X_valid.drop(bad_label_col, axis=1)

# Apply Ordinal Encoder
ord_train_data = X_train[good_label_cols].copy()
ord_valid_data = X_valid[good_label_cols].copy()

ordinal_encoder = OrdinalEncoder()
ord_train_data = pd.DataFrame(ordinal_encoder.fit_transform(ord_train_data))
ord_valid_data = pd.DataFrame(ordinal_encoder.transform(ord_valid_data))

# Convert column names to strings to avoid mixed types
ord_train_data.columns = ord_train_data.columns.astype(str)
ord_valid_data.columns = ord_valid_data.columns.astype(str)

# Restore the indices
ord_train_data.index = X_train.index
ord_valid_data.index = X_valid.index

# Drop encoded columns and concatenate encoded data
label_X_train = label_X_train.drop(good_label_cols, axis=1)
label_X_valid = label_X_valid.drop(good_label_cols, axis=1)

label_X_train = pd.concat([label_X_train, ord_train_data], axis=1)
label_X_valid = pd.concat([label_X_valid, ord_valid_data], axis=1)

# Ensure final DataFrames have string column names
label_X_train.columns = label_X_train.columns.astype(str)
label_X_valid.columns = label_X_valid.columns.astype(str)

# Evaluate the model
print("MAE from approach 2 (ordinal encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))



MAE from approach 2 (ordinal encoding):
17217.052374429226


In [7]:
# pipline tutorial

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("melb_data.csv")
y = data.Price
X = data.drop(['Price'], axis=1)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == 'object']
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [13]:
preprocessor

In [15]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)

In [17]:
from sklearn.metrics import mean_absolute_error

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

my_pipeline.fit(X_train, y_train)

preds = my_pipeline.predict(X_valid)

score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 160679.18917034855


In [19]:
# another_example

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split

X_full = pd.read_csv('home-data-for-ml-course/train.csv', index_col='Id')
X_test_full = pd.read_csv('home-data-for-ml-course/test.csv', index_col='Id')

X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)

categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == 'object']

numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols  + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()


In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = RandomForestRegressor(n_estimators=100, random_state=0)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

clf.fit(X_train, y_train)

preds = clf.predict(X_valid)

print("MAE:", mean_absolute_error(y_valid, preds))

MAE: 17614.81993150685


In [1]:
# new exercise

In [5]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

data = pd.read_csv("melb_data.csv")

cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

y = data.Price

my_pipeline = Pipeline(steps=[
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestRegressor(n_estimators=50, random_state=0))
])

scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
print("MAE scores:\n", scores)





MAE scores:
 [301628.7893587  303164.4782723  287298.331666   236061.84754543
 260383.45111427]


In [7]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

train_data = pd.read_csv('home-data-for-ml-course/train.csv', index_col='Id')
test_data = pd.read_csv('home-data-for-ml-course/test.csv', index_col='Id')

train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = train_data.SalePrice

train_data.drop(['SalePrice'], axis=1, inplace=True)

numerical_cols = [cname for cname  in train_data.columns if train_data[cname].dtype in ['int64', 'float64']]
X = train_data[numerical_cols].copy()
X_test = test_data[numerical_cols].copy()

my_pipeline = Pipeline(steps=[
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestRegressor(n_estimators=100, random_state=0))
])

scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
print("average MAE:", scores.mean())


average MAE: 18054.97936073059


In [9]:
# write a function

In [11]:
def get_score(n_estimators):
    my_pipeline = Pipeline(steps=[
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestRegressor(n_estimators=n_estimators, random_state=0))
    ])
    scores = -1 * cross_val_score(my_pipeline, X, y, cv=3, scoring='neg_mean_absolute_error')
    return np.mean(scores)

In [15]:
# Gradient Boosting

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

data = pd.read_csv("melb_data.csv")

cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

y = data.Price

X_train, X_valid, y_train, y_valid = train_test_split(X, y)

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
my_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)

predictions = my_model.predict(X_valid)
print("MAE:", mean_absolute_error(predictions, y_valid))

TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [36]:
git init

SyntaxError: invalid syntax (2830201818.py, line 1)