# Production Dataset

In [17]:
import pandas as pd
import numpy as np
import uuid
import random
from faker import Faker
import string
from datetime import datetime, timedelta, date

In [2]:
df = pd.read_csv("vehicles.csv")

In [3]:
def fill_na(column):
    actual = column.dropna().tolist()
    return column.apply(lambda x: x if pd.notna(x) else random.choice(actual))

def fill_na_vin(column):
    actual = column.dropna().tolist()
    vin_chars = string.ascii_uppercase.replace('I', '').replace('O', '').replace('Q', '') + string.digits

    def generate_random_vin():
        return ''.join(random.choices(vin_chars, k=17))
    
    return column.apply(lambda x: x if pd.notna(x) else generate_random_vin())

def generate_random_dates(n, start_date, end_date):
    start_u = start_date.value // 10**9
    end_u = end_date.value // 10**9
    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

In [4]:
df = df[27:]
df = df.reset_index().drop(["county", "size", "index"], axis=1)

In [5]:
df['drive'] = fill_na(df['drive'])
df['cylinders'] = fill_na(df['cylinders'])
df['paint_color'] = fill_na(df['paint_color'])
df['model'] = fill_na(df['model'])
df['manufacturer'] = fill_na(df['manufacturer'])
df['condition'] = fill_na(df['condition'])
df['fuel'] = fill_na(df['fuel'])
df['type'] = fill_na(df['type'])
df['VIN'] = fill_na_vin(df['VIN'])

avg_price = df["price"].mean()
df["price"] = (df["price"] / avg_price) * 200

df['cylinders'] = df['cylinders'].str.extract('(\d+)').astype(float)
df['cylinders'].fillna(df['cylinders'].dropna().sample(n=1).values[0], inplace=True)
df['cylinders'] = df['cylinders'].astype(int)


df['paint_color'] = df['paint_color'].replace('custom', 'white')
df['transmission'] = df['transmission'].replace('other', 'automatic')

df = df.rename(columns={
    "VIN": "vin"
})

end_date = pd.to_datetime('now')
start_date = end_date - pd.DateOffset(years=1)
df['posting_date'] = generate_random_dates(len(df), start_date, end_date)

df

  result, tz_parsed = tslib.array_to_datetime(


Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,vin,drive,type,paint_color,image_url,description,state,lat,long,posting_date
0,7316814884,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,89.331452,2014.0,gmc,sierra 1500 crew cab slt,good,8,...,3GTP1VEC4EG551563,fwd,pickup,white,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,al,32.590000,-85.480000,2024-04-14 20:23:00
1,7316814758,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,60.077330,2010.0,chevrolet,silverado 1500,good,8,...,1GCSCSE06AZ123805,fwd,pickup,blue,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,al,32.590000,-85.480000,2024-03-28 02:02:06
2,7316814989,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,105.288246,2020.0,chevrolet,silverado 1500 crew,good,8,...,3GCPWCED5LG130317,4wd,pickup,red,https://images.craigslist.org/01212_jjirIWa0y0...,Carvana is the safer way to buy a car During t...,al,32.590000,-85.480000,2024-05-26 02:16:27
3,7316743432,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,82.416841,2017.0,toyota,tundra double cab sr,good,8,...,5TFRM5F17HX120972,4wd,pickup,red,https://images.craigslist.org/00x0x_1y9kIOzGCF...,Carvana is the safer way to buy a car During t...,al,32.590000,-85.480000,2024-03-30 10:44:21
4,7316356412,https://auburn.craigslist.org/cto/d/auburn-uni...,auburn,https://auburn.craigslist.org,39.891985,2013.0,ford,f-150 xlt,excellent,6,...,AZDSDKZ6GUDWHK4A1,rwd,truck,black,https://images.craigslist.org/00404_l4loxHvdQe...,2013 F-150 XLT V6 4 Door. Good condition. Leve...,al,32.592000,-85.518900,2023-07-29 09:25:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426848,7301591192,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://wyoming.craigslist.org,62.736795,2019.0,nissan,maxima s sedan 4d,good,6,...,1N4AA6AV6KC367801,fwd,sedan,silver,https://images.craigslist.org/00o0o_iiraFnHg8q...,Carvana is the safer way to buy a car During t...,wy,33.786500,-84.445400,2023-12-06 21:24:33
426849,7301591187,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://wyoming.craigslist.org,81.353055,2020.0,volvo,s60 t5 momentum sedan 4d,good,5,...,7JR102FKXLG042696,fwd,sedan,red,https://images.craigslist.org/00x0x_15sbgnxCIS...,Carvana is the safer way to buy a car During t...,wy,33.786500,-84.445400,2023-08-30 02:18:07
426850,7301591147,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://wyoming.craigslist.org,93.054704,2020.0,cadillac,xt4 sport suv 4d,good,6,...,1GYFZFR46LF088296,4wd,hatchback,white,https://images.craigslist.org/00L0L_farM7bxnxR...,Carvana is the safer way to buy a car During t...,wy,33.779214,-84.411811,2023-09-26 05:19:55
426851,7301591140,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://wyoming.craigslist.org,77.097910,2018.0,lexus,es 350 sedan 4d,good,6,...,58ABK1GG4JU103853,fwd,sedan,silver,https://images.craigslist.org/00z0z_bKnIVGLkDT...,Carvana is the safer way to buy a car During t...,wy,33.786500,-84.445400,2024-02-07 16:48:36


In [6]:
df.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'vin', 'drive', 'type', 'paint_color', 'image_url',
       'description', 'state', 'lat', 'long', 'posting_date'],
      dtype='object')

In [7]:
unique_cars = df[['manufacturer', 'model']].drop_duplicates().reset_index(drop=True)
unique_cars['car_id'] = range(1, len(unique_cars) + 1)

cars_table = df[["id", "manufacturer", "model", "drive", "cylinders", "transmission", "paint_color"]]

cars_table = cars_table.merge(unique_cars, on=['manufacturer', 'model'], how='left').drop_duplicates().reset_index(drop=True)

cars_table

Unnamed: 0,id,manufacturer,model,drive,cylinders,transmission,paint_color,car_id
0,7316814884,gmc,sierra 1500 crew cab slt,4wd,8,automatic,white,1
1,7316814758,chevrolet,silverado 1500,rwd,8,automatic,blue,2
2,7316814989,chevrolet,silverado 1500 crew,rwd,8,automatic,red,3
3,7316743432,toyota,tundra double cab sr,4wd,8,automatic,red,4
4,7316356412,ford,f-150 xlt,rwd,6,automatic,black,5
...,...,...,...,...,...,...,...,...
426848,7301591192,nissan,maxima s sedan 4d,fwd,6,automatic,black,1100
426849,7301591187,volvo,s60 t5 momentum sedan 4d,fwd,4,automatic,red,56
426850,7301591147,cadillac,xt4 sport suv 4d,fwd,4,automatic,white,28
426851,7301591140,lexus,es 350 sedan 4d,fwd,6,automatic,silver,1292


In [8]:
colors = df["paint_color"].dropna().unique()
colors_table = pd.DataFrame(colors, columns=["color"])

hex_dict = {
    'black': '#000000',
    'blue': '#0000FF',
    'brown': '#A52A2A',
    'green': '#008000',
    'grey': '#808080',
    'orange': '#FFA500',
    'purple': '#800080',
    'red': '#FF0000',
    'silver': '#C0C0C0',
    'white': '#FFFFFF',
    'yellow': '#FFFF00',
}

colors_table['hex'] = colors_table['color'].map(hex_dict)

colors_table

Unnamed: 0,color,hex
0,white,#FFFFFF
1,blue,#0000FF
2,red,#FF0000
3,black,#000000
4,silver,#C0C0C0
5,grey,#808080
6,brown,#A52A2A
7,green,#008000
8,yellow,#FFFF00
9,orange,#FFA500


In [9]:
fake = Faker()

num_users = 500

user_data = {
    'user_id': random.sample(range(10000000, 100000000), num_users), # 8 digit user_id
    'first_name': [fake.first_name() for _ in range(num_users)],
    'last_name': [fake.last_name() for _ in range(num_users)],
    'email': [fake.email() for _ in range(num_users)],
    'date_of_birth': [fake.date_of_birth(minimum_age=18, maximum_age=90) for _ in range(num_users)],
    'password': [fake.password(length=10, special_chars=True, digits=True, upper_case=True, lower_case=True) for _ in range(num_users)]
}

users_table = pd.DataFrame(user_data)
users_table

Unnamed: 0,user_id,first_name,last_name,email,date_of_birth,password
0,27212364,Brittany,Murphy,maria76@example.net,1984-09-15,%QSqQ5nq(7
1,32317483,Brandon,Ayala,vpace@example.com,1969-11-14,O!9O4qF8(I
2,34518768,Alexandra,Parker,jacqueline86@example.net,1951-09-05,r238YEmZ*W
3,73347600,Jessica,Sandoval,mgardner@example.org,1961-01-05,w#z4NzZm3+
4,19650163,Latoya,Neal,cooperstephanie@example.net,1941-12-07,VolIKelE^3
...,...,...,...,...,...,...
495,31874198,William,Thomas,robertsonjeremiah@example.org,1956-06-09,%gP3NJWz%L
496,75466571,Adam,Gibbs,brandonjones@example.com,1945-04-18,k3iPkqpe_5
497,69531356,Kristy,Gray,victoranderson@example.org,1992-07-28,C0KzPRlV+(
498,56385499,Karen,Williams,jonathan29@example.net,1949-07-15,aIj87hwiE*


In [10]:
num_rentals = 1000

rental_data = {
    'rental_id': random.sample(range(1000000000, 10000000000), num_rentals), # 10 digit rental_id
    'car_id': [random.choice(cars_table['car_id']) for _ in range(num_rentals)],
    'user_id': [random.choice(users_table['user_id']) for _ in range(num_rentals)],
    'rent_date': [fake.date_this_year(before_today=True, after_today=False) for _ in range(num_rentals)],
}

num_active = num_rentals // 2
num_inactive = num_rentals - num_active

# active rentals
active_rent_dates = [fake.date_this_year(before_today=True, after_today=False) for _ in range(num_active)]
active_return_dates = [(pd.to_datetime(rent_date) + pd.to_timedelta(random.randint(1, 50), unit='d')).strftime('%Y-%m-%d')
                       for rent_date in active_rent_dates]

# inactive rentals
start_date = datetime.now() - timedelta(days=365 * 10)
inactive_rent_dates = [fake.date_between_dates(date_start=start_date, date_end=datetime.now() - timedelta(days=50))
                       for _ in range(num_inactive)]
inactive_return_dates = [(pd.to_datetime(rent_date) + pd.to_timedelta(random.randint(1, 50), unit='d')).strftime('%Y-%m-%d')
                         for rent_date in inactive_rent_dates]

rental_data['rent_date'] = active_rent_dates + inactive_rent_dates
rental_data['return_date'] = active_return_dates + inactive_return_dates
rental_data['status'] = ['active'] * num_active + ['inactive'] * num_inactive
rentals_table = pd.DataFrame(rental_data)
rentals_table = rentals_table.sample(frac=1).reset_index(drop=True)

rentals_table

Unnamed: 0,rental_id,car_id,user_id,rent_date,return_date,status
0,1168937878,21707,88662574,2024-05-03,2024-05-04,active
1,1717525259,4218,45634953,2024-03-29,2024-04-07,active
2,2830163511,137,56303852,2024-02-05,2024-03-15,active
3,7304017928,11025,28278749,2017-04-09,2017-05-21,inactive
4,3585585996,2,32047348,2024-04-11,2024-05-29,active
...,...,...,...,...,...,...
995,2370153063,3312,46720321,2024-06-06,2024-07-22,active
996,6770061771,157,26245655,2024-01-15,2024-01-22,active
997,2019964361,2900,33218988,2022-01-17,2022-02-08,inactive
998,1698727538,227,88662574,2023-06-05,2023-07-13,inactive


In [12]:
listings_table = df[["id", "url", "region", "image_url", "description", "condition", "vin", "lat", "long", "state", "price", "posting_date"]]
listings_table["car_id"] = cars_table['car_id']

listings_table = listings_table.rename(columns={
    "url": "listing_url",
    "lat": "lat_id",
    "long": "long_id",
    "state": "state_id"
})

listings_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_table["car_id"] = cars_table['car_id']


Unnamed: 0,id,listing_url,region,image_url,description,condition,vin,lat_id,long_id,state_id,price,posting_date,car_id
0,7316814884,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,good,3GTP1VEC4EG551563,32.590000,-85.480000,al,89.331452,2023-07-28 16:49:09,1
1,7316814758,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,good,1GCSCSE06AZ123805,32.590000,-85.480000,al,60.077330,2024-04-02 13:25:13,2
2,7316814989,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://images.craigslist.org/01212_jjirIWa0y0...,Carvana is the safer way to buy a car During t...,good,3GCPWCED5LG130317,32.590000,-85.480000,al,105.288246,2024-03-23 01:04:24,3
3,7316743432,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://images.craigslist.org/00x0x_1y9kIOzGCF...,Carvana is the safer way to buy a car During t...,good,5TFRM5F17HX120972,32.590000,-85.480000,al,82.416841,2024-04-02 12:42:57,4
4,7316356412,https://auburn.craigslist.org/cto/d/auburn-uni...,auburn,https://images.craigslist.org/00404_l4loxHvdQe...,2013 F-150 XLT V6 4 Door. Good condition. Leve...,excellent,506XWT6HAFWG1BAT0,32.592000,-85.518900,al,39.891985,2023-07-20 04:22:18,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
426848,7301591192,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://images.craigslist.org/00o0o_iiraFnHg8q...,Carvana is the safer way to buy a car During t...,good,1N4AA6AV6KC367801,33.786500,-84.445400,wy,62.736795,2024-04-22 13:43:38,1100
426849,7301591187,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://images.craigslist.org/00x0x_15sbgnxCIS...,Carvana is the safer way to buy a car During t...,good,7JR102FKXLG042696,33.786500,-84.445400,wy,81.353055,2024-03-30 19:26:23,56
426850,7301591147,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://images.craigslist.org/00L0L_farM7bxnxR...,Carvana is the safer way to buy a car During t...,good,1GYFZFR46LF088296,33.779214,-84.411811,wy,93.054704,2024-06-09 23:37:38,28
426851,7301591140,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://images.craigslist.org/00z0z_bKnIVGLkDT...,Carvana is the safer way to buy a car During t...,good,58ABK1GG4JU103853,33.786500,-84.445400,wy,77.097910,2024-01-19 16:15:09,1292


In [13]:
import pandas as pd

state_data = {
    'state_id': ['al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
                   'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma',
                   'mi', 'mn', 'ms', 'mo', 'mt', 'nc', 'ne', 'nv', 'nj', 'nm', 'ny',
                   'nh', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
                   'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy'],
    'state_name': ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'District of Columbia', 
                   'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 
                   'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 
                   'Montana', 'North Carolina', 'Nebraska', 'Nevada', 'New Jersey', 'New Mexico', 'New York', 'New Hampshire', 
                   'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 
                   'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'],
    'time_zone': ['Central', 'Alaska', 'Mountain', 'Central', 'Pacific', 'Mountain', 'Eastern', 'Eastern', 'Eastern', 'Eastern', 
                  'Eastern', 'Hawaii-Aleutian', 'Mountain', 'Central', 'Eastern', 'Central', 'Central', 'Eastern', 'Central', 
                  'Eastern', 'Eastern', 'Eastern', 'Eastern', 'Central', 'Central', 'Central', 'Mountain', 'Eastern', 'Central', 
                  'Pacific', 'Eastern', 'Mountain', 'Eastern', 'Eastern', 'Central', 'Eastern', 'Central', 'Pacific', 'Eastern', 
                  'Eastern', 'Eastern', 'Central', 'Central', 'Central', 'Mountain', 'Eastern', 'Eastern', 'Pacific', 'Eastern', 
                  'Central', 'Mountain']
}

states_table = pd.DataFrame(state_data)
states_table

Unnamed: 0,state_id,state_name,time_zone
0,al,Alabama,Central
1,ak,Alaska,Alaska
2,az,Arizona,Mountain
3,ar,Arkansas,Central
4,ca,California,Pacific
5,co,Colorado,Mountain
6,ct,Connecticut,Eastern
7,dc,District of Columbia,Eastern
8,de,Delaware,Eastern
9,fl,Florida,Eastern


In [14]:
num_favorites = 750
favorite_data = {
    'id': [random.choice(cars_table['id']) for _ in range(num_favorites)],
    'user_id': [random.choice(users_table['user_id']) for _ in range(num_favorites)]
}

favorites_table = pd.DataFrame(favorite_data)
favorites_table['car_id'] = favorites_table['id'].map(cars_table.set_index('id')['car_id'].to_dict())

favorites_table

Unnamed: 0,id,user_id,car_id
0,7310314165,68558566,566
1,7315137757,80476560,239
2,7316138479,30931024,2846
3,7302194328,14129170,2
4,7315167224,39118733,1304
...,...,...,...
745,7309843508,31837777,656
746,7315676999,52395548,192
747,7314837762,40218374,33
748,7315150201,34887668,295


In [15]:
# # save da files
# cars_table.to_csv('cars_table.csv', index=False)
# colors_table.to_csv('colors_table.csv', index=False)
# users_table.to_csv('users_table.csv', index=False)
# rentals_table.to_csv('rentals_table.csv', index=False)
# listings_table.to_csv('listings_table.csv', index=False)
# favorites_table.to_csv('favorites_table.csv', index=False)
# states_table.to_csv('states_table.csv', index=False)

# Sample Dataset

In [67]:
sample_df = df.sample(n=20).reset_index(drop=True)
min_mileage = 5000
max_mileage = 200000
sample_df['mileage'] = [random.randint(min_mileage, max_mileage) for _ in range(len(sample_df))]

sample_df

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,drive,type,paint_color,image_url,description,state,lat,long,posting_date,mileage
0,7316070182,https://losangeles.craigslist.org/wst/cto/d/to...,los angeles,https://losangeles.craigslist.org,55.848779,2019.0,mini,cooper convertible,like new,3,...,fwd,convertible,grey,https://images.craigslist.org/00t0t_4kDxJECWkm...,SALVAGE TITLE **** Read the entire ad before ...,ca,33.8268,-118.3118,2023-09-11 03:06:50,71156
1,7315046692,https://washingtondc.craigslist.org/nva/ctd/d/...,"washington, DC",https://washingtondc.craigslist.org,18.071069,2012.0,ford,focus,excellent,6,...,4wd,hatchback,green,https://images.craigslist.org/00o0o_8BENW87q9C...,2012 FORD FOCUS Titanium Offered by: CarNo...,dc,38.50942,-77.3709,2023-07-17 07:28:04,156985
2,7309903977,https://prescott.craigslist.org/ctd/d/prescott...,prescott,https://prescott.craigslist.org,41.46107,2016.0,honda,Scion iM Hatchback 4D,good,6,...,fwd,hatchback,green,https://images.craigslist.org/00N0N_1xMPvfxRAI...,Carvana is the safer way to buy a car During t...,az,34.62,-112.42,2024-06-07 08:35:33,20476
3,7308144849,https://flagstaff.craigslist.org/ctd/d/atlanta...,flagstaff / sedona,https://flagstaff.craigslist.org,81.353055,2018.0,mercedes-benz,c-class c 300,good,4,...,rwd,coupe,black,https://images.craigslist.org/00Q0Q_l9XK2Jv1lk...,Carvana is the safer way to buy a car During t...,az,33.7865,-84.4454,2024-02-04 21:34:09,90796
4,7303112586,https://neworleans.craigslist.org/ctd/d/metair...,new orleans,https://neworleans.craigslist.org,0.0,2015.0,audi,a3,excellent,4,...,fwd,coupe,white,https://images.craigslist.org/01010_71u2BgzZjG...,★NO CREDIT ? ★BAD CREDIT ? ★ BANKRUPTCY ? ★NO ...,la,29.998919,-90.155354,2024-01-03 20:22:00,138424
5,7315272787,https://lexington.craigslist.org/ctd/d/ferguso...,lexington,https://lexington.craigslist.org,42.285504,2013.0,chevrolet,equinox,good,6,...,fwd,SUV,white,https://images.craigslist.org/00S0S_5brJ1PIoqu...,Toyota of Somerset address: 4195 U.S. 27 So...,ky,37.038043,-84.626784,2024-04-06 00:29:28,132593
6,7301590585,https://westmd.craigslist.org/ctd/d/atlanta-20...,western maryland,https://westmd.craigslist.org,87.735773,2015.0,chevrolet,ss sedan 4d,good,8,...,rwd,sedan,green,https://images.craigslist.org/01313_1pP38JoY6b...,Carvana is the safer way to buy a car During t...,md,33.7865,-84.4454,2023-09-14 10:31:55,162001
7,7314895829,https://harrisonburg.craigslist.org/ctd/d/harr...,harrisonburg,https://harrisonburg.craigslist.org,29.227528,2004.0,toyota,4runner 4wd,excellent,6,...,4wd,SUV,silver,https://images.craigslist.org/00S0S_dqJU2kUpL9...,♔ ♛ ♔ ♛ ♔ ♛ ♔ ♛ ♔ ♛ ♔ ROYAL PIKE MOTORS ...,va,38.4489,-78.8714,2023-09-23 10:24:11,177166
8,7307956205,https://treasure.craigslist.org/ctd/d/okeechob...,treasure coast,https://treasure.craigslist.org,49.732008,2011.0,ford,f-150 xlt,excellent,8,...,rwd,pickup,black,https://images.craigslist.org/00S0S_7SSwbU5v1o...,** 5.0 V8 Coyote Engine ** Florida Born & Rais...,fl,27.234099,-80.829894,2023-10-12 01:18:02,118364
9,7313362366,https://sanantonio.craigslist.org/ctd/d/san-an...,san antonio,https://sanantonio.craigslist.org,71.778979,2017.0,audi,a6 2.0t premium plus sedan,good,6,...,fwd,sedan,silver,https://images.craigslist.org/01010_fKE111yABH...,Carvana is the safer way to buy a car During t...,tx,29.45,-98.5,2023-09-05 23:44:58,110062


In [68]:
sample_df.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'vin', 'drive', 'type', 'paint_color', 'image_url',
       'description', 'state', 'lat', 'long', 'posting_date', 'mileage'],
      dtype='object')

In [69]:
unique_cars = sample_df[['manufacturer', 'model']].drop_duplicates().reset_index(drop=True)

sample_cars_table = sample_df[["id", "manufacturer", "model", "drive", "cylinders", "transmission", "fuel"]]
sample_cars_table['drive'] = sample_cars_table['drive'].replace('other', 'gas')

sample_cars_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_cars_table['drive'] = sample_cars_table['drive'].replace('other', 'gas')


Unnamed: 0,id,manufacturer,model,drive,cylinders,transmission,fuel
0,7316070182,mini,cooper convertible,fwd,3,automatic,gas
1,7315046692,ford,focus,4wd,6,automatic,other
2,7309903977,honda,Scion iM Hatchback 4D,fwd,6,automatic,gas
3,7308144849,mercedes-benz,c-class c 300,rwd,4,automatic,gas
4,7303112586,audi,a3,fwd,4,automatic,gas
5,7315272787,chevrolet,equinox,fwd,6,automatic,gas
6,7301590585,chevrolet,ss sedan 4d,rwd,8,automatic,gas
7,7314895829,toyota,4runner 4wd,4wd,6,automatic,gas
8,7307956205,ford,f-150 xlt,rwd,8,automatic,gas
9,7313362366,audi,a6 2.0t premium plus sedan,fwd,6,automatic,gas


In [70]:
num_users = 10

user_data = {
    'user_id': [str(uuid.uuid4())[:8] for _ in range(num_users)],
    'first_name': [fake.first_name() for _ in range(num_users)],
    'last_name': [fake.last_name() for _ in range(num_users)],
    'email': [fake.email() for _ in range(num_users)],
    'date_of_birth': [fake.date_of_birth(minimum_age=18, maximum_age=60) for _ in range(num_users)],
    'password': [fake.password(length=10, special_chars=True, digits=True, upper_case=True, lower_case=True) for _ in range(num_users)]
}


sample_users_table = pd.DataFrame(user_data)

sample_users_table

Unnamed: 0,user_id,first_name,last_name,email,date_of_birth,password
0,6d8b9cc0,Monica,Allen,valeriemcguire@example.org,1982-10-05,MDs3RkVk)W
1,07484e0a,Deanna,Bennett,joseph67@example.net,1990-12-10,XgL46aRh@6
2,67032ae7,Robert,Hamilton,xavierellis@example.net,1995-07-31,qkI#QR1O%0
3,428186a6,James,Lane,douglasmichael@example.com,1985-12-12,z9NItmZ8%w
4,13f2b4b1,James,Banks,ryan39@example.com,1973-10-23,OlbCydQb%5
5,5ba415b2,Juan,White,brandoncoleman@example.org,1993-08-05,L24sAh8L@5
6,9d121582,Robert,Bass,sandra83@example.net,2001-11-08,V(96pSQj_C
7,e87e5459,Michelle,Soto,kimberly41@example.org,2000-03-28,gq0Sh!Exu+
8,0f85cc64,Ashley,Guzman,lopezwillie@example.org,1998-04-20,v5nCcpqj$@
9,a27be510,David,Perez,garywarner@example.com,1978-11-08,gE1Kd4mFV%


In [71]:
fake = Faker()

num_owners = 10

owner_data = {
    'user_id': [str(uuid.uuid4())[:8] for _ in range(num_owners)],
    'first_name': [fake.first_name() for _ in range(num_owners)],
    'last_name': [fake.last_name() for _ in range(num_owners)],
    'email': [fake.email() for _ in range(num_owners)],
    'date_of_birth': [fake.date_of_birth(minimum_age=18, maximum_age=60) for _ in range(num_owners)],
    'password': [fake.password(length=10, special_chars=True, digits=True, upper_case=True, lower_case=True) for _ in range(num_owners)],
    'status': [random.choice(['active', 'inactive', 'banned']) for _ in range(num_owners)],
    'dealer_rating': [random.randint(1, 5) for _ in range(num_owners)],
    'dealer_num_ratings': [random.randint(1, 1000) for _ in range(num_owners)]
}

sample_owners_table = pd.DataFrame(owner_data)
sample_owners_table

Unnamed: 0,user_id,first_name,last_name,email,date_of_birth,password,status,dealer_rating,dealer_num_ratings
0,505bf830,Michael,Francis,gary18@example.net,1982-10-10,95rOhQJS+k,banned,3,440
1,1a10b317,Adam,Norton,april01@example.org,1989-12-11,Up1nsWLbD(,banned,3,906
2,b734f9f5,Jeffrey,Flores,bjimenez@example.net,1979-05-24,$k6b8Ji&uI,banned,5,748
3,fc37d24d,Jorge,Kelley,fjohnson@example.net,1964-11-08,r&1pxLvhA&,inactive,1,517
4,ff473545,Hannah,Murphy,cindyortiz@example.net,1988-06-04,##7AAoP36P,inactive,5,738
5,40c7a1d7,Suzanne,Harrison,justin89@example.net,1986-06-17,2a2SxDieZ),active,1,524
6,77c7a3d5,Jason,Henry,khays@example.com,2000-01-22,)Ge#9zYber,banned,5,70
7,ad0ae349,Kelly,Fields,carol25@example.org,1996-08-22,hCB4Dfp+^(,banned,2,246
8,e3bb3259,Albert,Stuart,amandagreer@example.net,1991-12-25,mp1JT+KvO),active,1,842
9,34cdedf5,Michelle,Jenkins,ibush@example.com,1970-06-20,F8Bjhpe*!5,active,2,806


In [73]:
num_reviews = 15

car_review_phrases = [
    "The car runs smoothly and has great mileage.",
    "I love the interior design and the comfort it offers.",
    "The engine performance is exceptional!",
    "Very reliable and fuel-efficient.",
    "The car looks stylish and is very easy to handle.",
    "I've had some issues with the transmission.",
    "The car is a bit overpriced for what it offers.",
    "Excellent customer service from the dealer.",
    "I had a great buying experience and the car is fantastic.",
    "The sound system in the car is top-notch.",
    "It's a decent car but lacks advanced features.",
    "Very spacious and comfortable for long drives.",
    "The air conditioning system works perfectly.",
    "I had to take it for repairs sooner than expected.",
    "The car offers a smooth and quiet ride.",
    "The handling is responsive and precise.",
    "The fuel economy could be better.",
    "Comfortable seating for long trips.",
    "The car's safety features are impressive.",
    "Great value for money.",
    "The exterior design is eye-catching.",
    "The dashboard layout is intuitive and user-friendly.",
    "Acceleration is swift and powerful.",
    "The car is perfect for city driving.",
    "The brakes are sensitive and reliable.",
    "The headlights provide excellent visibility.",
    "The suspension absorbs bumps well.",
    "The seats are supportive and adjustable.",
    "Road noise is minimal.",
    "The car's reliability rating is high.",
    "Cargo space is ample for groceries and luggage.",
    "The steering wheel feels comfortable in hand.",
    "The infotainment system is easy to use.",
    "The car handles well in all weather conditions.",
    "The paint quality is durable and glossy.",
    "The mileage per gallon is impressive.",
    "Maintenance costs are reasonable.",
    "The seats are heated and ventilated.",
    "The navigation system is accurate and responsive.",
    "The car has good resale value.",
    "I feel safe driving this car.",
    "The climate control system is effective.",
    "The rearview camera provides a clear view.",
    "The car accelerates smoothly from a stop.",
    "The gearbox shifts gears seamlessly.",
    "The suspension offers a comfortable ride.",
    "The cabin is quiet at highway speeds.",
    "The car's warranty coverage is comprehensive.",
    "The touchscreen interface is intuitive.",
    "The car handles curves with confidence.",
    "The interior materials are high-quality.",
    "The car's technology features are impressive.",
    "The seats have plenty of legroom.",
    "The steering is precise and responsive.",
    "The car's braking system is reliable.",
    "The adaptive cruise control works well.",
    "The car's design is timeless.",
    "The engine is powerful yet fuel-efficient.",
    "The car's Bluetooth connectivity is seamless.",
    "The dashboard displays essential information clearly.",
    "The sound quality of the audio system is excellent.",
    "The car has good visibility all around.",
    "The car's headlights illuminate the road well.",
    "The seats are easy to adjust for comfort.",
    "The car feels stable at high speeds.",
    "The steering is light and easy to maneuver.",
    "The car has plenty of storage compartments.",
    "The car's handling is agile and responsive.",
    "The seats provide excellent lumbar support.",
    "The car's resale value holds up well.",
    "The interior is spacious and well-designed.",
    "The car accelerates quickly when needed.",
    "The car's fuel tank is large for long trips.",
    "The car's exterior color is vibrant.",
    "The car's dashboard is well-organized.",
    "The car's entertainment system keeps passengers entertained.",
    "The car's suspension absorbs bumps smoothly.",
    "The car's cabin is insulated from road noise.",
    "The car's climate control keeps the cabin comfortable.",
    "The car's seats are plush and comfortable.",
    "The car's engine is quiet at idle.",
    "The car's transmission shifts gears seamlessly.",
    "The car's steering is precise and responsive.",
    "The car's brakes are strong and reliable.",
    "The car's fuel efficiency is impressive for its size.",
    "The car's safety features provide peace of mind.",
    "The car's infotainment system is intuitive to use.",
    "The car's interior design is modern and attractive.",
    "The car's exterior design is sporty and eye-catching.",
    "The car's ride quality is smooth and comfortable.",
    "The car's acceleration is brisk and powerful.",
    "The car's handling is nimble and agile.",
    "The car's suspension offers a balanced ride.",
    "The car's seats are supportive and ergonomic.",
    "The car's cabin is spacious and airy.",
    "The car's visibility is excellent from all angles.",
    "The car's technology features are cutting-edge.",
    "The car's fuel economy is efficient.",
    "The car's steering wheel feels solid and well-designed.",
    "The car's dashboard layout is logical and user-friendly.",
    "The car's rear seats are comfortable for passengers.",
    "The car's trunk space is generous for luggage.",
    "The car's exterior styling is elegant and refined.",
    "The car's interior materials are high-grade and durable.",
    "The car's ride is smooth and composed.",
    "The car's acceleration is strong and responsive.",
    "The car's handling is agile and precise.",
    "The car's brakes are powerful and reliable.",
    "The car's fuel efficiency is excellent.",
    "The car's safety features are advanced and effective.",
    "The car's infotainment system is intuitive and feature-rich.",
    "The car's interior design is stylish and modern.",
    "The car's exterior design is sleek and aerodynamic.",
    "The car's suspension absorbs bumps with ease.",
    "The car's seats are comfortable for long drives.",
    "The car's steering is light and responsive.",
    "The car's cabin is quiet at highway speeds.",
    "The car's visibility is clear and unobstructed.",
    "The car's technology enhances the driving experience.",
    "The car's climate control system is efficient and effective.",
    "The car's seats provide ample support and comfort.",
    "The car's audio system delivers crisp and clear sound.",
    "The car's interior is well-insulated from outside noise.",
    "The car's transmission shifts smoothly between gears.",
    "The car's exterior color is vibrant and attractive.",
    "The car's headlights provide excellent illumination.",
    "The car's touchscreen interface is user-friendly.",
    "The car's navigation system is accurate and reliable.",
    "The car's cargo space is ample for groceries and luggage.",
    "The car's steering response is precise and predictable.",
    "The car's suspension offers a smooth and controlled ride.",
    "The car's seats are plush and supportive.",
    "The car's cabin layout is practical and ergonomic.",
    "The car's exterior lines are sleek and modern.",
    "The car's engine delivers strong performance.",
    "The car's fuel economy is impressive for its class.",
    "The car's safety ratings are top-notch.",
    "The car's technology suite is comprehensive and intuitive.",
    "The car's interior materials are luxurious and refined.",
    "The car's exterior design is bold and distinctive.",
    "The car's ride quality is smooth and comfortable.",
    "The car's acceleration is brisk and exhilarating.",
    "The car's handling is precise and agile.",
    "The car's braking system is responsive and reliable.",
    "The car's fuel efficiency is excellent for its size.",
    "The car's safety features provide peace of mind.",
    "The car's infotainment system is easy to use.",
    "The car's interior design is elegant and sophisticated.",
    "The car's exterior styling is dynamic and attractive.",
    "The car's suspension absorbs bumps well.",
    "The car's cabin is spacious and comfortable.",
    "The car's steering is light and responsive.",
    "The car's visibility is excellent from all angles.",
    "The car's technology features are user-friendly.",
    "The car's fuel economy is efficient.",
    "The car's steering wheel feels comfortable in hand.",
    "The car's dashboard layout is intuitive and clear.",
    "The car's rear seats offer ample legroom.",
    "The car's trunk provides generous cargo space.",
    "The car's exterior finish is durable and glossy.",
    "The car's ride is smooth and composed.",
    "The car's acceleration is brisk and powerful.",
    "The car's handling is precise and nimble.",
    "The car's brakes are strong and responsive.",
    "The car's fuel efficiency is impressive for its class.",
    "The car's safety features are advanced and effective.",
    "The car's infotainment system is intuitive and responsive.",
    "The car's interior design is modern and attractive.",
    "The car's exterior design is sporty and eye-catching.",
    "The car's suspension provides a comfortable ride.",
    "The car's seats are supportive and comfortable.",
    "The car's cabin is spacious and well-appointed.",
    "The car's visibility is excellent from the driver's seat.",
    "The car's technology features enhance convenience and safety.",
    "The car's climate control system maintains a comfortable cabin temperature.",
    "The car's seats are ergonomically designed for long trips.",
    "The car's audio system delivers immersive sound quality.",
    "The car's interior is insulated from external noise.",
    "The car's transmission shifts seamlessly through gears.",
    "The car's exterior color is vibrant and eye-catching.",
    "The car's headlights illuminate the road ahead effectively.",
    "The car's touchscreen interface is intuitive to navigate."
]

def generate_car_review():
    phrase = random.choice(car_review_phrases)
    additional_comment = fake.sentence()
    return f"{phrase} {additional_comment}"

review_data = {
    'review_id': [str(uuid.uuid4())[:8] for _ in range(num_reviews)],
    'author': [random.choice(sample_users_table['user_id']) for _ in range(num_reviews)],  # FK to users table
    'Listing': [random.choice(sample_listings_table['listing_id']) for _ in range(num_reviews)],  # FK to listings table
    'rating': [random.randint(1, 5) for _ in range(num_reviews)],
    'Comments': [generate_car_review() for _ in range(num_reviews)]
}

reviews_table = pd.DataFrame(review_data)
reviews_table

Unnamed: 0,review_id,author,Listing,rating,Comments
0,9b85aac3,5ba415b2,7313818002,4,Excellent customer service from the dealer. Be...
1,851f8081,07484e0a,7311973060,4,The car's reliability rating is high. Civil of...
2,bdbf69a8,0f85cc64,7315321664,5,It's a decent car but lacks advanced features....
3,5d41ca56,9d121582,7314902275,1,The car's exterior design is sporty and eye-ca...
4,64695801,428186a6,7309786052,1,The car's entertainment system keeps passenger...
5,a020d67d,0f85cc64,7306687695,1,The car accelerates smoothly from a stop. Proj...
6,5db4bf9e,13f2b4b1,7306687695,4,The car's steering response is precise and pre...
7,8bebcab8,e87e5459,7310005793,5,The car's interior design is modern and attrac...
8,1c66c820,13f2b4b1,7311973060,1,The car's interior design is modern and attrac...
9,ef9317d8,e87e5459,7314943923,4,The interior is spacious and well-designed. Ai...


In [65]:
num_rentals = 20

rental_data = {
    'rental_id': [str(uuid.uuid4())[:8] for _ in range(num_rentals)],
    'listing_id': np.random.choice(sample_listings_table['listing_id'], size=num_rentals),
    'user_id': np.random.choice(sample_users_table['user_id'], size=num_rentals),
    'rent_date': [datetime.now() - timedelta(days=random.randint(1, 365)) for _ in range(num_rentals)],
    'return_date': [None] * num_rentals, 
    'status': ['renting'] * num_rentals 
}

for i in range(num_rentals):
    if rental_data['status'][i] == 'renting':
        # between 1 and 50 days from rent_date
        rental_data['return_date'][i] = rental_data['rent_date'][i] + timedelta(days=random.randint(1, 50))
        # 'returned' if return_date is in the past
        if rental_data['return_date'][i] <= datetime.now():
            rental_data['status'][i] = 'returned'
        else:
            rental_data['return_date'][i] = None

sample_rentals_table = pd.DataFrame(rental_data)
sample_rentals_table

Unnamed: 0,rental_id,listing_id,user_id,rent_date,return_date,status
0,a1b856e6,7311973060,5dacf0cf,2024-03-02 17:23:01.093866,2024-04-11 17:23:01.093866,returned
1,d63b95cb,7314774180,473c15cd,2024-01-08 17:23:01.093866,2024-02-27 17:23:01.093866,returned
2,56f4cfa4,7309308906,a8953ea0,2023-11-11 17:23:01.093866,2023-12-19 17:23:01.093866,returned
3,b8720ed7,7314958973,5dacf0cf,2023-07-13 17:23:01.093866,2023-08-05 17:23:01.093866,returned
4,c636f4f7,7308527640,cf625100,2023-11-14 17:23:01.093866,2023-12-03 17:23:01.093866,returned
5,3fd7e9cb,7306253042,473c15cd,2024-02-24 17:23:01.093866,2024-04-09 17:23:01.093866,returned
6,a4ef48eb,7309301075,473c15cd,2023-11-24 17:23:01.093866,2023-12-10 17:23:01.093866,returned
7,41524906,7311814907,cf625100,2023-10-19 17:23:01.093866,2023-11-25 17:23:01.093866,returned
8,1ca4118a,7308527640,27bc3710,2024-05-21 17:23:01.093866,NaT,renting
9,fff40fb9,7309786052,fadec98d,2023-11-15 17:23:01.093866,2023-11-24 17:23:01.093866,returned


In [60]:
sample_listings_table = sample_df[["id", "manufacturer", "model", "region", "image_url", "description", "condition", "state", "posting_date", "mileage"]]
sample_listings_table = sample_listings_table.rename(columns={
    "id": "listing_id",
    "state": "state_id"
})


sample_listings_table['status'] = [random.choice(['active', 'inactive']) for _ in range(len(sample_listings_table))]
sample_listings_table['price'] = [random.uniform(1, 60) for _ in range(len(sample_listings_table))]
sample_listings_table['posting_date'] = pd.to_datetime(sample_listings_table['posting_date'], errors='coerce')
sample_listings_table['posting_date'] = sample_listings_table['posting_date'].apply(lambda x: x if x.year > 1900 else datetime.now())
sample_listings_table['owner_id'] = np.random.choice(sample_owners_table['user_id'], size=len(sample_listings_table))
sample_listings_table['rent_time'] = [random.randint(5, 30) for _ in range(len(sample_listings_table))]

sample_listings_table


Unnamed: 0,listing_id,manufacturer,model,region,image_url,description,condition,state_id,posting_date,mileage,status,price,owner_id,rent_time
0,7314958973,jeep,patriot latitude,maine,https://images.craigslist.org/00g0g_5LgrbjrsqF...,Jeep Patriot Latitude edition 4X4 Reliable fun...,like new,me,2024-05-07 03:10:14,110644,active,15.793378,d2e4b244,5
1,7314943923,audi,a4,"washington, DC",https://images.craigslist.org/01717_joii0xiLh3...,2015 AUDI A4 Premium Plus Offered by: DMV ...,good,dc,2023-11-14 22:23:07,155255,inactive,39.483949,2b660f55,25
2,7310005793,chevrolet,silverado 2500 ltz du,medford-ashland,https://images.craigslist.org/00G0G_h2H1LtrsWM...,Carz Planet STOCK #: 7029 💲 💲 FOR...,good,or,2024-04-28 15:37:43,128842,inactive,57.02525,9b6b2588,30
3,7308527640,chrysler,200,rochester,https://images.craigslist.org/00u0u_5Ihm5flMkK...,Car is in good shape and has been well maintai...,good,mn,2023-07-26 15:55:12,78039,active,42.884265,9b6b2588,24
4,7311814907,chrysler,pacifica,pittsburgh,https://images.craigslist.org/00B0B_iLpMIMg11A...,Krebs Chrysler Jeep Dodge address: 100 Kreb...,good,pa,2023-08-02 07:04:45,97443,active,18.423603,2b660f55,9
5,7313649821,ford,expedition 4x4,victoria,https://images.craigslist.org/00j0j_j5zw72k5LN...,"2003 Ford Expedition 4x4 203k miles , 5.4Ltr R...",good,tx,2024-06-18 16:07:38,38194,inactive,6.327017,469f8e70,22
6,7311960164,gmc,sierra duramax 2500 hd sle,wenatchee,https://images.craigslist.org/00u0u_hNgIM2933S...,2019 GMC Sierra 2500 HD SLE 4x4 **One Owner Cl...,like new,wa,2023-10-06 15:51:25,97465,inactive,6.687611,3f960a20,19
7,7314902275,ram,1500 lone star,san antonio,https://images.craigslist.org/00K0K_fbtmYnPXti...,I’m selling a clean title 2006 Dodge Ram 1500 ...,good,tx,2024-01-24 17:07:02,46474,inactive,38.027153,469f8e70,10
8,7311973060,toyota,highlander se awd gas suv,wenatchee,https://images.craigslist.org/00n0n_iTgmKhb2cq...,2017 Toyota Highlander SE AWD **Clean Carfax O...,excellent,wa,2024-04-01 12:40:45,41087,inactive,6.714604,d2e4b244,8
9,7314774180,hyundai,sonata,"kansas city, MO",https://images.craigslist.org/00A0A_fV2n4PpQK6...,"2011 Hyundai Sonata GLS -- $9,497 ☎ Call...",excellent,ks,2024-04-12 20:53:35,194826,active,39.327006,9b6b2588,14


In [42]:
state_data = {
    'state_id': ['al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
                   'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma',
                   'mi', 'mn', 'ms', 'mo', 'mt', 'nc', 'ne', 'nv', 'nj', 'nm', 'ny',
                   'nh', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
                   'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy'],
    'state_name': ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'District of Columbia', 
                   'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 
                   'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 
                   'Montana', 'North Carolina', 'Nebraska', 'Nevada', 'New Jersey', 'New Mexico', 'New York', 'New Hampshire', 
                   'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 
                   'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'],
    'time_zone': ['Central', 'Alaska', 'Mountain', 'Central', 'Pacific', 'Mountain', 'Eastern', 'Eastern', 'Eastern', 'Eastern', 
                  'Eastern', 'Hawaii-Aleutian', 'Mountain', 'Central', 'Eastern', 'Central', 'Central', 'Eastern', 'Central', 
                  'Eastern', 'Eastern', 'Eastern', 'Eastern', 'Central', 'Central', 'Central', 'Mountain', 'Eastern', 'Central', 
                  'Pacific', 'Eastern', 'Mountain', 'Eastern', 'Eastern', 'Central', 'Eastern', 'Central', 'Pacific', 'Eastern', 
                  'Eastern', 'Eastern', 'Central', 'Central', 'Central', 'Mountain', 'Eastern', 'Eastern', 'Pacific', 'Eastern', 
                  'Central', 'Mountain']
}

sample_states_table = pd.DataFrame(state_data)
sample_states_table

Unnamed: 0,state_id,state_name,time_zone
0,al,Alabama,Central
1,ak,Alaska,Alaska
2,az,Arizona,Mountain
3,ar,Arkansas,Central
4,ca,California,Pacific
5,co,Colorado,Mountain
6,ct,Connecticut,Eastern
7,dc,District of Columbia,Eastern
8,de,Delaware,Eastern
9,fl,Florida,Eastern


In [43]:
num_favorites = 20
favorite_data = {
    'id': [random.choice(sample_cars_table['id']) for _ in range(num_favorites)],
    'user_id': [random.choice(sample_users_table['user_id']) for _ in range(num_favorites)]
}

sample_favorites_table = pd.DataFrame(favorite_data)

sample_favorites_table

Unnamed: 0,id,user_id
0,7308527640,27bc3710
1,7306253042,54b89153
2,7311814907,54b89153
3,7309301075,a8953ea0
4,7313649821,cf625100
5,7314774180,4a71d9d6
6,7313818002,27bc3710
7,7315096465,27bc3710
8,7313649821,8e6ce405
9,7314902275,8e6ce405


In [24]:
# save da files
sample_cars_table.to_csv('sample_cars_table.csv', index=False)
sample_colors_table.to_csv('sample_colors_table.csv', index=False)
sample_users_table.to_csv('sample_users_table.csv', index=False)
sample_rentals_table.to_csv('sample_rentals_table.csv', index=False)
sample_listings_table.to_csv('sample_listings_table.csv', index=False)
sample_favorites_table.to_csv('sample_favorites_table.csv', index=False)
sample_states_table.to_csv('sample_states_table.csv', index=False)

### Data Summaries

In [28]:
lengths = []
for i in (df["url"] + df["image_url"]):
    lengths.append(len(str(i)))

max(lengths)

169

In [15]:
# lat and long
max_lat = max(df["lat"].unique())
min_lat = min(df["lat"].unique())

max_long = max(df["long"].unique())
min_long = min(df["long"].unique())

print("lat: [", min_lat, ",", max_lat, "] long: [", min_long, ",", max_long, "]")

lat: [ -84.122245 , 82.390818 ] long: [ -159.827728 , 173.885502 ]


In [19]:
df["state"].dropna().unique()

array(['al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
       'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma',
       'mi', 'mn', 'ms', 'mo', 'mt', 'nc', 'ne', 'nv', 'nj', 'nm', 'ny',
       'nh', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
       'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy'], dtype=object)

In [20]:
df["condition"].dropna().unique()

array(['good', 'excellent', 'fair', 'like new', 'new', 'salvage'],
      dtype=object)

In [23]:
user_favorites = favorites_table[favorites_table['userID'] == 'user_25764002']
user_favorites

Unnamed: 0,favoriteID,id,userID
2,ee48c532-74cb-441b-866e-bc80d93b2983,7305679700,user_25764002
62,8bc0a3ef-2b21-4166-a4f3-3c71340284a1,7315916207,user_25764002
432,70b71a2d-2668-4515-858b-43cb648357db,7316856596,user_25764002
519,f1b6b231-3410-4140-aad3-2087e9e2f1a9,7313105260,user_25764002


In [21]:
len(df['manufacturer'].unique())

42

In [19]:
df['model'].unique()

array(['sierra 1500 crew cab slt', 'silverado 1500',
       'silverado 1500 crew', ..., 'gand wagoneer', '96 Suburban',
       'Paige Glenbrook Touring'], dtype=object)

In [20]:
len(df['model'].unique())

29667