In [24]:
def data_preprocessing(raw_data_path, prep_data_path, bucket):

    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import LabelEncoder, StandardScaler,OneHotEncoder
    from sklearn.pipeline import make_pipeline,Pipeline
    from sklearn.compose import make_column_transformer
    from google.cloud import storage    
    from io import BytesIO

    # client = storage.Client()
    # file_name = raw_data
    # bucket = client.get_bucket(bucket_name)
    # blob = bucket.get_blob(raw_data)
    # content = blob.download_as_string()
    #df = pd.read_csv(BytesIO(content))
    df = pd.read_csv(bucket + raw_data_path)


    #deleting unwanted columns
    drop_columns = ['id','url', 'region', 'region_url','model','title_status', 'title_status','county', 'vin', 'description','size', 'image_url', 'lat','long','state','paint_color','cylinders']
    df = df.drop(columns=drop_columns)
    #deleting rows with nan values
    df = df.dropna()
    #reformatting/cleaning numeric columns
    df['price'] = df['price'].astype(int)
    df['year'] = df['year'].astype(int)
    df['odometer'] = df['odometer'].astype(int)
    df['odometer'] = df['odometer'] // 5000
    df = df[df['year'] > 110]
    df = df[(df['price']>1000) & (df['price']<50000)]

    #reformatting/cleaning categorical columns
    df['manufacturer'] = df['manufacturer'].astype(str)
    df['condition'] = df['condition'].astype(str)
    # df['cylinders'] = df['cylinders'].astype(str)
    df['fuel'] = df['fuel'].astype(str)
    df['transmission'] = df['transmission'].astype(str)
    df['drive'] = df['drive'].astype(str)
    df['type'] = df['type'].astype(str)
    df=df[df['transmission']!='other']
    df=df.reset_index()
    
    #print(df['transmission'].value_counts())

    #label encode columns

    lab_cat_columns=['transmission']

#     for col in lab_cat_columns:
#         if col in df.columns:
#             le = LabelEncoder()
#             le.fit(list(df[col].astype(str).values))
#             df[col] = le.transform(list(df[col].astype(str).values))

    conditions = {'salvage': 0, 'fair': 1, 'good': 2, 'excellent': 3, 'like new': 4, 'new': 5}
    df['condition'].replace(conditions, inplace=True)
    
    transmissions={'automatic': 1, 'manual': 0}
    df['transmission'].replace(transmissions, inplace=True)      
    #Creating pipeline

    numerical_features=['year', 'odometer']
    one_hot_cat_columns=['manufacturer','fuel','drive','type']


    categoric_transformer = make_pipeline(OneHotEncoder(sparse=False,handle_unknown='ignore'))

    # Creating a pipeline with mean imputer for numerical data 
    numeric_transformer =  make_pipeline(StandardScaler())  

    #Creating label transformer

    # label_transformer=make_pipeline(LabelEncoder())

    # Combining both pipelines such that each pipeline works on the columns it was meant for
    preprocessor = make_column_transformer((categoric_transformer,one_hot_cat_columns),
                                            (numeric_transformer,numerical_features))
    #                                           (label_transformer,lab_cat_columns))

    pipe=Pipeline(steps = [('prep',preprocessor)])
    results=pipe.fit_transform(df)
    results=pd.DataFrame(data=results, columns=list(pd.get_dummies(df[one_hot_cat_columns]).columns)+numerical_features )

    final_df=results
    # final_df['year']=df['year']
    # final_df['odometer']=df['odometer']
    final_df['condition']=df['condition']
    final_df['transmission']=df['transmission']
    final_df['price']=df['price']
    
    columns_series = pd.Series(final_df.columns, dtype='string', name='train_df_columns')
    columns_series.to_csv(bucket + '/data/train_columns.csv', index=False)

#     final_df.to_csv(bucket + prep_data_path)
#     return prep_data_path

In [25]:
data_preprocessing('/data/raw_vehicles.csv', '/data/prep_vehicles.csv', 'gs://de-3')

In [19]:
def rf_model_training(prep_data_path, bucket, bucket_name, model_path):
    
    import pandas as pd
    import numpy as np
    from google.cloud import storage  
    from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV, StratifiedKFold
    import _pickle as cPickle 
    from sklearn import metrics
    from sklearn.ensemble import RandomForestRegressor  
    from urllib.parse import urlparse

    df = pd.read_csv(bucket + prep_data_path)

    #Seperating dataset and target variable
    target_name = 'price'
    df_target = df[target_name]
    df = df.drop([target_name], axis=1)
    #Train test split
    train, test, target, target_test = train_test_split(df, df_target, test_size=0.2, random_state=0)
    #return [train, test, target, target_test]

    #read preprocessed data
    #train, test, target, target_test = _read_and_split_data(prep_data_path, bucket)
    #Tuning RF Parameters
    rf_param_grid = {'n_estimators': [100,500],
                'max_features': [0.2,0.7]
                }
    rf_GS = GridSearchCV(RandomForestRegressor(n_jobs=-1), param_grid=rf_param_grid,
                    cv=ShuffleSplit(n_splits=3,random_state=1), verbose=False, pre_dispatch='2*n_jobs')

    rf_GS.fit(train, target)


    score=rf_GS.score(train, target)
    y_pred=rf_GS.predict(test)
    #print('R^2 on the train set', score)
    print('R2 score', metrics.r2_score(target_test, y_pred))

    temp_model_path='/tmp/rf_model.pickle'
    with open(temp_model_path, 'wb') as f:
        cPickle.dump(rf_GS, f, -1)
    
    parse = urlparse(url=bucket+model_path, allow_fragments=False)
    print(parse.path)
    print(parse.netloc)
    
    if parse.path[0] =='/':
        parsed_model_path = parse.path[1:]
    client = storage.Client()
    bucket = client.get_bucket(parse.netloc)
    model = bucket.blob(parsed_model_path)
    model.upload_from_filename(temp_model_path)
    return model_path

In [20]:
rf_model_training('/data/prep_vehicles.csv', 'gs://de-3','de-3', '/models/rf_model.pickle')

R2 score 0.7816622118347524
/models/rf_model.pickle
de-3


'/models/rf_model.pickle'

In [19]:
def write_best_model():
#     best_model_file = open(r"/best_model.txt","w+")
#     best_model_file.write('models/xgb_model.pickle')
#     best_model_file.close()
    import _pickle as cPickle
    from google.cloud import storage  
    path = 'models/xgb_model.pickle'
    temp_model_path='/tmp/best_model.pickle'
    with open(temp_model_path, 'wb') as f:
        cPickle.dump(path, f, -1)
        
    client = storage.Client()
    bucket = client.get_bucket('de-3')
    model = bucket.blob('')
    model.upload_from_filename(temp_model_path)

In [20]:
write_best_model()

In [15]:
import pandas as pd

car_dicts = [{'manufacturer': 'bmw', 'fuel': 'gas', 'drive': 'rwd', 'type':'coupe', 
              'transmission':'automatic', 'condition':'new', 'year': 2010, 'odometer': 50000 }]

df_cols = ['manufacturer_acura', 'manufacturer_audi', 'manufacturer_bmw',
       'manufacturer_buick', 'manufacturer_cadillac', 'manufacturer_chevrolet',
       'manufacturer_chrysler', 'manufacturer_dodge', 'manufacturer_fiat',
       'manufacturer_ford', 'manufacturer_gmc', 'manufacturer_honda',
       'manufacturer_hyundai', 'manufacturer_infiniti', 'manufacturer_jaguar',
       'manufacturer_jeep', 'manufacturer_kia', 'manufacturer_lexus',
       'manufacturer_lincoln', 'manufacturer_mazda',
       'manufacturer_mercedes-benz', 'manufacturer_mercury',
       'manufacturer_mini', 'manufacturer_mitsubishi', 'manufacturer_nissan',
       'manufacturer_pontiac', 'manufacturer_ram', 'manufacturer_rover',
       'manufacturer_saturn', 'manufacturer_subaru', 'manufacturer_tesla',
       'manufacturer_toyota', 'manufacturer_volkswagen', 'manufacturer_volvo',
       'fuel_diesel', 'fuel_electric', 'fuel_gas', 'fuel_hybrid', 'fuel_other',
       'drive_4wd', 'drive_fwd', 'drive_rwd', 'type_SUV', 'type_bus',
       'type_convertible', 'type_coupe', 'type_hatchback', 'type_mini-van',
       'type_offroad', 'type_other', 'type_pickup', 'type_sedan', 'type_truck',
       'type_van', 'type_wagon', 'year', 'odometer', 'condition',
       'transmission']

conditions = {'salvage': 0, 'fair': 1, 'good': 2, 'excellent': 3, 'like new': 4, 'new': 5}
transmissions = {'manual': 0, 'automatic': 1}


encoded_df = pd.DataFrame(columns=df_cols)

for car in car_dicts:
    new_row_dict = {}
    for col in df_cols:
        if 'manufacturer' in col:
            if car['manufacturer'] in col:
                new_row_dict[col]=1
            else: 
                new_row_dict[col]=0
        if 'fuel' in col:
            if car['fuel'] in col:
                new_row_dict[col]=1
            else: 
                new_row_dict[col]=0                
        elif 'drive' in col:
            if car['drive'] in col:
                new_row_dict[col]=1
            else: 
                new_row_dict[col]=0        
        elif 'type' in col:
            if car['type'] in col:
                new_row_dict[col]=1
            else: 
                new_row_dict[col]=0        
        elif col=='condition':
            new_row_dict['condition']= conditions[car['condition']]        
        elif col=='transmission':
            new_row_dict['transmission']= transmissions[car['transmission']]
        elif col=='year': 
            new_row_dict[col]= car[col]
        elif col=='odometer': 
            new_row_dict[col]= car[col]
    #print (len(new_row_dict.keys()))
    encoded_df = encoded_df.append(new_row_dict, ignore_index=True)
encoded_df = encoded_df.astype('float64')   
min_odometer = encoded_df['odometer'].idxmin()
print(encoded_df['year'][min_odometer])

2010.0


In [12]:
import pandas as pd
from flask import Flask, json, request, Response
import _pickle as cPickle

import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler,OneHotEncoder
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.compose import make_column_transformer

#from resources import predictor

app = Flask(__name__)
app.config["DEBUG"] = True

def preprocess(new_cars): 
    #label encode columns

    df_cols = ['manufacturer_acura', 'manufacturer_audi', 'manufacturer_bmw',
        'manufacturer_buick', 'manufacturer_cadillac', 'manufacturer_chevrolet',
        'manufacturer_chrysler', 'manufacturer_dodge', 'manufacturer_fiat',
        'manufacturer_ford', 'manufacturer_gmc', 'manufacturer_honda',
        'manufacturer_hyundai', 'manufacturer_infiniti', 'manufacturer_jaguar',
        'manufacturer_jeep', 'manufacturer_kia', 'manufacturer_lexus',
        'manufacturer_lincoln', 'manufacturer_mazda',
        'manufacturer_mercedes-benz', 'manufacturer_mercury',
        'manufacturer_mini', 'manufacturer_mitsubishi', 'manufacturer_nissan',
        'manufacturer_pontiac', 'manufacturer_ram', 'manufacturer_rover',
        'manufacturer_saturn', 'manufacturer_subaru', 'manufacturer_tesla',
        'manufacturer_toyota', 'manufacturer_volkswagen', 'manufacturer_volvo',
        'fuel_diesel', 'fuel_electric', 'fuel_gas', 'fuel_hybrid', 'fuel_other',
        'drive_4wd', 'drive_fwd', 'drive_rwd', 'type_SUV', 'type_bus',
        'type_convertible', 'type_coupe', 'type_hatchback', 'type_mini-van',
        'type_offroad', 'type_other', 'type_pickup', 'type_sedan', 'type_truck',
        'type_van', 'type_wagon', 'year', 'odometer', 'condition',
        'transmission']

    conditions = {'salvage': 0, 'fair': 1, 'good': 2, 'excellent': 3, 'like new': 4, 'new': 5}
    transmissions = {'manual': 0, 'automatic': 1}


    encoded_df = pd.DataFrame(columns=df_cols)

    for car in new_cars:
        new_row_dict = {}
        for col in df_cols:
            if 'manufacturer' in col:
                if car['manufacturer'] in col:
                    new_row_dict[col]=1
                else: 
                    new_row_dict[col]=0
            if 'fuel' in col:
                if car['fuel'] in col:
                    new_row_dict[col]=1
                else: 
                    new_row_dict[col]=0                
            elif 'drive' in col:
                if car['drive'] in col:
                    new_row_dict[col]=1
                else: 
                    new_row_dict[col]=0        
            elif 'type' in col:
                if car['type'] in col:
                    new_row_dict[col]=1
                else: 
                    new_row_dict[col]=0        
            elif col=='condition':
                new_row_dict['condition']= conditions[car['condition']]        
            elif col=='transmission':
                new_row_dict['transmission']= transmissions[car['transmission']]
            elif col=='year': 
                new_row_dict[col]= car[col]
            elif col=='odometer': 
                new_row_dict[col]= car[col]
        #print (new_row_dict)
        encoded_df = encoded_df.append(new_row_dict, ignore_index=True)
        
    encoded_df = encoded_df.astype('float64')
    min_odometer = encoded_df['odometer'].idxmin()
    print(encoded_df['score'][min_odometer])
    return encoded_df


@app.route('/price-predict', methods=['POST'])
def predict_perf():
    content = request.get_json()
    #print(content)

    #test_df = pd.read_json(json.dumps(content), orient='records')

    js_str_ = json.dumps(content)
    dicts = json.loads(js_str)
    #print(type(dict_[0]['sepal_length']))

    prep_cars = preprocess(dicts)

    client = storage.Client()
    bucket = client.get_bucket('de-3')
    blob = bucket.get_blob('models/xgb_model.pickle')
    if blob is None:
        raise AttributeError('No files to download') 
    model_bytestream = BytesIO(blob.download_as_string())
    model = cPickle.load(model_bytestream)

    result = model.predict(prep_cars)
    js_result=json.dumps(result.to_dict(orient='records'), indent=4, sort_keys=False)

    resp = Response(js_result, status=200, mimetype='application/json')
    resp.headers['Access-Control-Allow-Origin'] = '*'
    resp.headers['Access-Control-Allow-Methods'] = 'POST'
    resp.headers['Access-Control-Max-Age'] = '1000'
    return resp
    


app.run(host='0.0.0.0', port=5000)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
 * Restarting with inotify reloader


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
