In [10]:
def data_preprocessing(raw_data_path, prep_data_path, bucket):

    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import LabelEncoder, StandardScaler,OneHotEncoder
    from sklearn.pipeline import make_pipeline,Pipeline
    from sklearn.compose import make_column_transformer
    from google.cloud import storage    
    from io import BytesIO

    # client = storage.Client()
    # file_name = raw_data
    # bucket = client.get_bucket(bucket_name)
    # blob = bucket.get_blob(raw_data)
    # content = blob.download_as_string()
    #df = pd.read_csv(BytesIO(content))
    df = pd.read_csv(bucket + raw_data_path)


    #deleting unwanted columns
    drop_columns = ['id','url', 'region', 'region_url','model','title_status', 'title_status','county', 'vin', 'description','size', 'image_url', 'lat','long','state','paint_color','cylinders']
    df = df.drop(columns=drop_columns)
    #deleting rows with nan values
    df = df.dropna()
    #reformatting/cleaning numeric columns
    df['price'] = df['price'].astype(int)
    df['year'] = df['year'].astype(int)
    df['odometer'] = df['odometer'].astype(int)
    df['odometer'] = df['odometer'] // 5000
    df = df[df['year'] > 110]
    df = df[(df['price']>1000) & (df['price']<50000)]

    #reformatting/cleaning categorical columns
    df['manufacturer'] = df['manufacturer'].astype(str)
    df['condition'] = df['condition'].astype(str)
    # df['cylinders'] = df['cylinders'].astype(str)
    df['fuel'] = df['fuel'].astype(str)
    df['transmission'] = df['transmission'].astype(str)
    df['drive'] = df['drive'].astype(str)
    df['type'] = df['type'].astype(str)
    df=df[df['transmission']!='other']
    df=df.reset_index()

    #label encode columns

    lab_cat_columns=['condition','transmission']

    for col in lab_cat_columns:
        if col in df.columns:
            le = LabelEncoder()
            le.fit(list(df[col].astype(str).values))
            df[col] = le.transform(list(df[col].astype(str).values))
            
    #Creating pipeline

    numerical_features=['year', 'odometer']
    one_hot_cat_columns=['manufacturer','fuel','drive','type']


    categoric_transformer = make_pipeline(OneHotEncoder(sparse=False,handle_unknown='ignore'))

    # Creating a pipeline with mean imputer for numerical data 
    numeric_transformer =  make_pipeline(StandardScaler())  

    #Creating label transformer

    # label_transformer=make_pipeline(LabelEncoder())

    # Combining both pipelines such that each pipeline works on the columns it was meant for
    preprocessor = make_column_transformer((categoric_transformer,one_hot_cat_columns),
                                            (numeric_transformer,numerical_features))
    #                                           (label_transformer,lab_cat_columns))

    pipe=Pipeline(steps = [('prep',preprocessor)])
    results=pipe.fit_transform(df)
    results=pd.DataFrame(data=results, columns=list(pd.get_dummies(df[one_hot_cat_columns]).columns)+numerical_features )

    final_df=results
    # final_df['year']=df['year']
    # final_df['odometer']=df['odometer']
    final_df['condition']=df['condition']
    final_df['transmission']=df['transmission']
    final_df['price']=df['price']

    final_df.to_csv(bucket + prep_data_path)
    return prep_data_path

In [11]:
data_preprocessing('/data/raw_vehicles.csv', '/data/prep_vehicles.csv', 'gs://de-3')

'/data/prep_vehicles.csv'

In [19]:
def rf_model_training(prep_data_path, bucket, bucket_name, model_path):
    
    import pandas as pd
    import numpy as np
    from google.cloud import storage  
    from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV, StratifiedKFold
    import _pickle as cPickle 
    from sklearn import metrics
    from sklearn.ensemble import RandomForestRegressor  
    from urllib.parse import urlparse

    df = pd.read_csv(bucket + prep_data_path)

    #Seperating dataset and target variable
    target_name = 'price'
    df_target = df[target_name]
    df = df.drop([target_name], axis=1)
    #Train test split
    train, test, target, target_test = train_test_split(df, df_target, test_size=0.2, random_state=0)
    #return [train, test, target, target_test]

    #read preprocessed data
    #train, test, target, target_test = _read_and_split_data(prep_data_path, bucket)
    #Tuning RF Parameters
    rf_param_grid = {'n_estimators': [100,500],
                'max_features': [0.2,0.7]
                }
    rf_GS = GridSearchCV(RandomForestRegressor(n_jobs=-1), param_grid=rf_param_grid,
                    cv=ShuffleSplit(n_splits=3,random_state=1), verbose=False, pre_dispatch='2*n_jobs')

    rf_GS.fit(train, target)


    score=rf_GS.score(train, target)
    y_pred=rf_GS.predict(test)
    #print('R^2 on the train set', score)
    print('R2 score', metrics.r2_score(target_test, y_pred))

    temp_model_path='/tmp/rf_model.pickle'
    with open(temp_model_path, 'wb') as f:
        cPickle.dump(rf_GS, f, -1)
    
    parse = urlparse(url=bucket+model_path, allow_fragments=False)
    print(parse.path)
    print(parse.netloc)
    
    if parse.path[0] =='/':
        parsed_model_path = parse.path[1:]
    client = storage.Client()
    bucket = client.get_bucket(parse.netloc)
    model = bucket.blob(parsed_model_path)
    model.upload_from_filename(temp_model_path)
    return model_path

In [20]:
rf_model_training('/data/prep_vehicles.csv', 'gs://de-3','de-3', '/models/rf_model.pickle')

R2 score 0.7816622118347524
/models/rf_model.pickle
de-3


'/models/rf_model.pickle'