# Production Dataset

In [1]:
import pandas as pd
import numpy as np
import uuid
import random
from faker import Faker
import string
from datetime import datetime, timedelta

In [2]:
df = pd.read_csv("vehicles.csv")

In [3]:
def fill_na(column):
    actual = column.dropna().tolist()
    return column.apply(lambda x: x if pd.notna(x) else random.choice(actual))

def fill_na_vin(column):
    actual = column.dropna().tolist()
    vin_chars = string.ascii_uppercase.replace('I', '').replace('O', '').replace('Q', '') + string.digits

    def generate_random_vin():
        return ''.join(random.choices(vin_chars, k=17))
    
    return column.apply(lambda x: x if pd.notna(x) else generate_random_vin())

def generate_random_dates(n, start_date, end_date):
    start_u = start_date.value // 10**9
    end_u = end_date.value // 10**9
    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

In [4]:
df = df[27:]
df = df.reset_index().drop(["county", "size", "index"], axis=1)

In [5]:
df['drive'] = fill_na(df['drive'])
df['cylinders'] = fill_na(df['cylinders'])
df['paint_color'] = fill_na(df['paint_color'])
df['model'] = fill_na(df['model'])
df['manufacturer'] = fill_na(df['manufacturer'])
df['condition'] = fill_na(df['condition'])
df['fuel'] = fill_na(df['fuel'])
df['type'] = fill_na(df['type'])
df['VIN'] = fill_na_vin(df['VIN'])

avg_price = df["price"].mean()
df["price"] = (df["price"] / avg_price) * 200

df['cylinders'] = df['cylinders'].str.extract('(\d+)').astype(float)
df['cylinders'].fillna(df['cylinders'].dropna().sample(n=1).values[0], inplace=True)
df['cylinders'] = df['cylinders'].astype(int)


df['paint_color'] = df['paint_color'].replace('custom', 'white')
df['transmission'] = df['transmission'].replace('other', 'automatic')

df = df.rename(columns={
    "VIN": "vin"
})

end_date = pd.to_datetime('now')
start_date = end_date - pd.DateOffset(years=1)
df['posting_date'] = generate_random_dates(len(df), start_date, end_date)

df

  result, tz_parsed = tslib.array_to_datetime(


Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,vin,drive,type,paint_color,image_url,description,state,lat,long,posting_date
0,7316814884,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,89.331452,2014.0,gmc,sierra 1500 crew cab slt,good,8,...,3GTP1VEC4EG551563,4wd,pickup,white,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,al,32.590000,-85.480000,2023-07-28 16:49:09
1,7316814758,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,60.077330,2010.0,chevrolet,silverado 1500,good,8,...,1GCSCSE06AZ123805,rwd,pickup,blue,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,al,32.590000,-85.480000,2024-04-02 13:25:13
2,7316814989,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,105.288246,2020.0,chevrolet,silverado 1500 crew,good,8,...,3GCPWCED5LG130317,rwd,pickup,red,https://images.craigslist.org/01212_jjirIWa0y0...,Carvana is the safer way to buy a car During t...,al,32.590000,-85.480000,2024-03-23 01:04:24
3,7316743432,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,82.416841,2017.0,toyota,tundra double cab sr,good,8,...,5TFRM5F17HX120972,4wd,pickup,red,https://images.craigslist.org/00x0x_1y9kIOzGCF...,Carvana is the safer way to buy a car During t...,al,32.590000,-85.480000,2024-04-02 12:42:57
4,7316356412,https://auburn.craigslist.org/cto/d/auburn-uni...,auburn,https://auburn.craigslist.org,39.891985,2013.0,ford,f-150 xlt,excellent,6,...,506XWT6HAFWG1BAT0,rwd,truck,black,https://images.craigslist.org/00404_l4loxHvdQe...,2013 F-150 XLT V6 4 Door. Good condition. Leve...,al,32.592000,-85.518900,2023-07-20 04:22:18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426848,7301591192,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://wyoming.craigslist.org,62.736795,2019.0,nissan,maxima s sedan 4d,good,6,...,1N4AA6AV6KC367801,fwd,sedan,black,https://images.craigslist.org/00o0o_iiraFnHg8q...,Carvana is the safer way to buy a car During t...,wy,33.786500,-84.445400,2024-04-22 13:43:38
426849,7301591187,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://wyoming.craigslist.org,81.353055,2020.0,volvo,s60 t5 momentum sedan 4d,good,4,...,7JR102FKXLG042696,fwd,sedan,red,https://images.craigslist.org/00x0x_15sbgnxCIS...,Carvana is the safer way to buy a car During t...,wy,33.786500,-84.445400,2024-03-30 19:26:23
426850,7301591147,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://wyoming.craigslist.org,93.054704,2020.0,cadillac,xt4 sport suv 4d,good,4,...,1GYFZFR46LF088296,fwd,hatchback,white,https://images.craigslist.org/00L0L_farM7bxnxR...,Carvana is the safer way to buy a car During t...,wy,33.779214,-84.411811,2024-06-09 23:37:38
426851,7301591140,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://wyoming.craigslist.org,77.097910,2018.0,lexus,es 350 sedan 4d,good,6,...,58ABK1GG4JU103853,fwd,sedan,silver,https://images.craigslist.org/00z0z_bKnIVGLkDT...,Carvana is the safer way to buy a car During t...,wy,33.786500,-84.445400,2024-01-19 16:15:09


In [6]:
df.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'vin', 'drive', 'type', 'paint_color', 'image_url',
       'description', 'state', 'lat', 'long', 'posting_date'],
      dtype='object')

In [7]:
unique_cars = df[['manufacturer', 'model']].drop_duplicates().reset_index(drop=True)
unique_cars['car_id'] = range(1, len(unique_cars) + 1)

cars_table = df[["id", "manufacturer", "model", "drive", "cylinders", "transmission", "paint_color"]]

cars_table = cars_table.merge(unique_cars, on=['manufacturer', 'model'], how='left').drop_duplicates().reset_index(drop=True)

cars_table

Unnamed: 0,id,manufacturer,model,drive,cylinders,transmission,paint_color,car_id
0,7316814884,gmc,sierra 1500 crew cab slt,4wd,8,automatic,white,1
1,7316814758,chevrolet,silverado 1500,rwd,8,automatic,blue,2
2,7316814989,chevrolet,silverado 1500 crew,rwd,8,automatic,red,3
3,7316743432,toyota,tundra double cab sr,4wd,8,automatic,red,4
4,7316356412,ford,f-150 xlt,rwd,6,automatic,black,5
...,...,...,...,...,...,...,...,...
426848,7301591192,nissan,maxima s sedan 4d,fwd,6,automatic,black,1100
426849,7301591187,volvo,s60 t5 momentum sedan 4d,fwd,4,automatic,red,56
426850,7301591147,cadillac,xt4 sport suv 4d,fwd,4,automatic,white,28
426851,7301591140,lexus,es 350 sedan 4d,fwd,6,automatic,silver,1292


In [8]:
colors = df["paint_color"].dropna().unique()
colors_table = pd.DataFrame(colors, columns=["color"])

hex_dict = {
    'black': '#000000',
    'blue': '#0000FF',
    'brown': '#A52A2A',
    'green': '#008000',
    'grey': '#808080',
    'orange': '#FFA500',
    'purple': '#800080',
    'red': '#FF0000',
    'silver': '#C0C0C0',
    'white': '#FFFFFF',
    'yellow': '#FFFF00',
}

colors_table['hex'] = colors_table['color'].map(hex_dict)

colors_table

Unnamed: 0,color,hex
0,white,#FFFFFF
1,blue,#0000FF
2,red,#FF0000
3,black,#000000
4,silver,#C0C0C0
5,grey,#808080
6,brown,#A52A2A
7,green,#008000
8,yellow,#FFFF00
9,orange,#FFA500


In [9]:
fake = Faker()

num_users = 500

user_data = {
    'user_id': random.sample(range(10000000, 100000000), num_users), # 8 digit user_id
    'first_name': [fake.first_name() for _ in range(num_users)],
    'last_name': [fake.last_name() for _ in range(num_users)],
    'email': [fake.email() for _ in range(num_users)],
    'date_of_birth': [fake.date_of_birth(minimum_age=18, maximum_age=90) for _ in range(num_users)],
    'password': [fake.password(length=10, special_chars=True, digits=True, upper_case=True, lower_case=True) for _ in range(num_users)]
}

users_table = pd.DataFrame(user_data)
users_table

Unnamed: 0,user_id,first_name,last_name,email,date_of_birth,password
0,27212364,Brittany,Murphy,maria76@example.net,1984-09-15,%QSqQ5nq(7
1,32317483,Brandon,Ayala,vpace@example.com,1969-11-14,O!9O4qF8(I
2,34518768,Alexandra,Parker,jacqueline86@example.net,1951-09-05,r238YEmZ*W
3,73347600,Jessica,Sandoval,mgardner@example.org,1961-01-05,w#z4NzZm3+
4,19650163,Latoya,Neal,cooperstephanie@example.net,1941-12-07,VolIKelE^3
...,...,...,...,...,...,...
495,31874198,William,Thomas,robertsonjeremiah@example.org,1956-06-09,%gP3NJWz%L
496,75466571,Adam,Gibbs,brandonjones@example.com,1945-04-18,k3iPkqpe_5
497,69531356,Kristy,Gray,victoranderson@example.org,1992-07-28,C0KzPRlV+(
498,56385499,Karen,Williams,jonathan29@example.net,1949-07-15,aIj87hwiE*


In [10]:
num_rentals = 1000

rental_data = {
    'rental_id': random.sample(range(1000000000, 10000000000), num_rentals), # 10 digit rental_id
    'car_id': [random.choice(cars_table['car_id']) for _ in range(num_rentals)],
    'user_id': [random.choice(users_table['user_id']) for _ in range(num_rentals)],
    'rent_date': [fake.date_this_year(before_today=True, after_today=False) for _ in range(num_rentals)],
}

num_active = num_rentals // 2
num_inactive = num_rentals - num_active

# active rentals
active_rent_dates = [fake.date_this_year(before_today=True, after_today=False) for _ in range(num_active)]
active_return_dates = [(pd.to_datetime(rent_date) + pd.to_timedelta(random.randint(1, 50), unit='d')).strftime('%Y-%m-%d')
                       for rent_date in active_rent_dates]

# inactive rentals
start_date = datetime.now() - timedelta(days=365 * 10)
inactive_rent_dates = [fake.date_between_dates(date_start=start_date, date_end=datetime.now() - timedelta(days=50))
                       for _ in range(num_inactive)]
inactive_return_dates = [(pd.to_datetime(rent_date) + pd.to_timedelta(random.randint(1, 50), unit='d')).strftime('%Y-%m-%d')
                         for rent_date in inactive_rent_dates]

rental_data['rent_date'] = active_rent_dates + inactive_rent_dates
rental_data['return_date'] = active_return_dates + inactive_return_dates
rental_data['status'] = ['active'] * num_active + ['inactive'] * num_inactive
rentals_table = pd.DataFrame(rental_data)
rentals_table = rentals_table.sample(frac=1).reset_index(drop=True)

rentals_table

Unnamed: 0,rental_id,car_id,user_id,rent_date,return_date,status
0,1168937878,21707,88662574,2024-05-03,2024-05-04,active
1,1717525259,4218,45634953,2024-03-29,2024-04-07,active
2,2830163511,137,56303852,2024-02-05,2024-03-15,active
3,7304017928,11025,28278749,2017-04-09,2017-05-21,inactive
4,3585585996,2,32047348,2024-04-11,2024-05-29,active
...,...,...,...,...,...,...
995,2370153063,3312,46720321,2024-06-06,2024-07-22,active
996,6770061771,157,26245655,2024-01-15,2024-01-22,active
997,2019964361,2900,33218988,2022-01-17,2022-02-08,inactive
998,1698727538,227,88662574,2023-06-05,2023-07-13,inactive


In [12]:
listings_table = df[["id", "url", "region", "image_url", "description", "condition", "vin", "lat", "long", "state", "price", "posting_date"]]
listings_table["car_id"] = cars_table['car_id']

listings_table = listings_table.rename(columns={
    "url": "listing_url",
    "lat": "lat_id",
    "long": "long_id",
    "state": "state_id"
})

listings_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_table["car_id"] = cars_table['car_id']


Unnamed: 0,id,listing_url,region,image_url,description,condition,vin,lat_id,long_id,state_id,price,posting_date,car_id
0,7316814884,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,good,3GTP1VEC4EG551563,32.590000,-85.480000,al,89.331452,2023-07-28 16:49:09,1
1,7316814758,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,good,1GCSCSE06AZ123805,32.590000,-85.480000,al,60.077330,2024-04-02 13:25:13,2
2,7316814989,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://images.craigslist.org/01212_jjirIWa0y0...,Carvana is the safer way to buy a car During t...,good,3GCPWCED5LG130317,32.590000,-85.480000,al,105.288246,2024-03-23 01:04:24,3
3,7316743432,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://images.craigslist.org/00x0x_1y9kIOzGCF...,Carvana is the safer way to buy a car During t...,good,5TFRM5F17HX120972,32.590000,-85.480000,al,82.416841,2024-04-02 12:42:57,4
4,7316356412,https://auburn.craigslist.org/cto/d/auburn-uni...,auburn,https://images.craigslist.org/00404_l4loxHvdQe...,2013 F-150 XLT V6 4 Door. Good condition. Leve...,excellent,506XWT6HAFWG1BAT0,32.592000,-85.518900,al,39.891985,2023-07-20 04:22:18,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
426848,7301591192,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://images.craigslist.org/00o0o_iiraFnHg8q...,Carvana is the safer way to buy a car During t...,good,1N4AA6AV6KC367801,33.786500,-84.445400,wy,62.736795,2024-04-22 13:43:38,1100
426849,7301591187,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://images.craigslist.org/00x0x_15sbgnxCIS...,Carvana is the safer way to buy a car During t...,good,7JR102FKXLG042696,33.786500,-84.445400,wy,81.353055,2024-03-30 19:26:23,56
426850,7301591147,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://images.craigslist.org/00L0L_farM7bxnxR...,Carvana is the safer way to buy a car During t...,good,1GYFZFR46LF088296,33.779214,-84.411811,wy,93.054704,2024-06-09 23:37:38,28
426851,7301591140,https://wyoming.craigslist.org/ctd/d/atlanta-2...,wyoming,https://images.craigslist.org/00z0z_bKnIVGLkDT...,Carvana is the safer way to buy a car During t...,good,58ABK1GG4JU103853,33.786500,-84.445400,wy,77.097910,2024-01-19 16:15:09,1292


In [13]:
import pandas as pd

state_data = {
    'state_id': ['al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
                   'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma',
                   'mi', 'mn', 'ms', 'mo', 'mt', 'nc', 'ne', 'nv', 'nj', 'nm', 'ny',
                   'nh', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
                   'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy'],
    'state_name': ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'District of Columbia', 
                   'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 
                   'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 
                   'Montana', 'North Carolina', 'Nebraska', 'Nevada', 'New Jersey', 'New Mexico', 'New York', 'New Hampshire', 
                   'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 
                   'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'],
    'time_zone': ['Central', 'Alaska', 'Mountain', 'Central', 'Pacific', 'Mountain', 'Eastern', 'Eastern', 'Eastern', 'Eastern', 
                  'Eastern', 'Hawaii-Aleutian', 'Mountain', 'Central', 'Eastern', 'Central', 'Central', 'Eastern', 'Central', 
                  'Eastern', 'Eastern', 'Eastern', 'Eastern', 'Central', 'Central', 'Central', 'Mountain', 'Eastern', 'Central', 
                  'Pacific', 'Eastern', 'Mountain', 'Eastern', 'Eastern', 'Central', 'Eastern', 'Central', 'Pacific', 'Eastern', 
                  'Eastern', 'Eastern', 'Central', 'Central', 'Central', 'Mountain', 'Eastern', 'Eastern', 'Pacific', 'Eastern', 
                  'Central', 'Mountain']
}

states_table = pd.DataFrame(state_data)
states_table

Unnamed: 0,state_id,state_name,time_zone
0,al,Alabama,Central
1,ak,Alaska,Alaska
2,az,Arizona,Mountain
3,ar,Arkansas,Central
4,ca,California,Pacific
5,co,Colorado,Mountain
6,ct,Connecticut,Eastern
7,dc,District of Columbia,Eastern
8,de,Delaware,Eastern
9,fl,Florida,Eastern


In [14]:
num_favorites = 750
favorite_data = {
    'id': [random.choice(cars_table['id']) for _ in range(num_favorites)],
    'user_id': [random.choice(users_table['user_id']) for _ in range(num_favorites)]
}

favorites_table = pd.DataFrame(favorite_data)
favorites_table['car_id'] = favorites_table['id'].map(cars_table.set_index('id')['car_id'].to_dict())

favorites_table

Unnamed: 0,id,user_id,car_id
0,7310314165,68558566,566
1,7315137757,80476560,239
2,7316138479,30931024,2846
3,7302194328,14129170,2
4,7315167224,39118733,1304
...,...,...,...
745,7309843508,31837777,656
746,7315676999,52395548,192
747,7314837762,40218374,33
748,7315150201,34887668,295


In [15]:
# # save da files
# cars_table.to_csv('cars_table.csv', index=False)
# colors_table.to_csv('colors_table.csv', index=False)
# users_table.to_csv('users_table.csv', index=False)
# rentals_table.to_csv('rentals_table.csv', index=False)
# listings_table.to_csv('listings_table.csv', index=False)
# favorites_table.to_csv('favorites_table.csv', index=False)
# states_table.to_csv('states_table.csv', index=False)

# Sample Dataset

In [16]:
sample_df = df.sample(n=50).reset_index(drop=True)
sample_df

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,vin,drive,type,paint_color,image_url,description,state,lat,long,posting_date
0,7316737397,https://salina.craigslist.org/ctd/d/salina-201...,salina,https://salina.craigslist.org,77.09791,2017.0,bmw,4 series 440i xdrive gran,good,6,...,WBA4E5C56HG189275,fwd,coupe,grey,https://images.craigslist.org/00606_eTQPhZGTkM...,Carvana is the safer way to buy a car During t...,ks,38.81,-97.61,2024-05-01 20:52:08
1,7315303863,https://nashville.craigslist.org/ctd/d/murfree...,nashville,https://nashville.craigslist.org,239.218938,2020.0,mercedes-benz,s-class,excellent,4,...,WDDUG8DB7LA521485,rwd,sedan,black,https://images.craigslist.org/00n0n_ktxXjzsmY5...,2020 Mercedes-Benz S560 PREMIUM RWD W/NAV ...,tn,35.840175,-86.387603,2024-06-05 05:16:46
2,7306160055,https://knoxville.craigslist.org/ctd/d/knoxvil...,knoxville,https://knoxville.craigslist.org,171.774888,2019.0,audi,q8 premium sport utility 4d,good,6,...,WA1AVAF10KD039141,rwd,other,blue,https://images.craigslist.org/00e0e_c50g4fIkpN...,Carvana is the safer way to buy a car During t...,tn,35.97,-83.94,2023-06-23 15:57:20
3,7310276447,https://bemidji.craigslist.org/ctd/d/perham-20...,bemidji,https://bemidji.craigslist.org,67.151508,2017.0,ford,f-150,like new,8,...,1FTFX1EF7HKC12788,4wd,truck,white,https://images.craigslist.org/00t0t_5Vkb0qv8r8...,"2017 Ford F-150 XLT -- $25,250 ☎ Call: (...",mn,46.590185,-95.57469,2023-12-08 09:29:49
4,7303685479,https://eauclaire.craigslist.org/ctd/d/wiscons...,eau claire,https://eauclaire.craigslist.org,34.49327,2015.0,chevrolet,equinox,excellent,4,...,T435XZZHZ62NGMX3L,4wd,SUV,white,https://images.craigslist.org/00H0H_2c6SA6BmDW...,2015 Chevrolet Equinox LS AWD 2.4 4 Cylinder ...,wi,44.410164,-89.882412,2024-05-06 17:32:51
5,7311927824,https://olympic.craigslist.org/ctd/d/puyallup-...,olympic peninsula,https://olympic.craigslist.org,178.181541,2020.0,ford,expedition max,good,6,...,1FMJK1JT2LEA38166,4wd,SUV,black,https://images.craigslist.org/00T0T_1nXrsOqH1Q...,Joydrive: 253-201-0804 https://joydrive.com/...,wa,47.1991,-122.3151,2023-12-29 15:02:07
6,7310325874,https://porthuron.craigslist.org/ctd/d/point-e...,port huron,https://porthuron.craigslist.org,105.288246,2019.0,acura,mdx sh-awd w/technology,good,6,...,5J8YD4H50KL008682,4wd,other,white,https://images.craigslist.org/00h0h_dGAvgLShy8...,Carvana is the safer way to buy a car During t...,mi,43.01717,-82.45099,2024-04-23 05:58:14
7,7316146250,https://washingtondc.craigslist.org/nva/ctd/d/...,"washington, DC",https://washingtondc.craigslist.org,97.309849,2020.0,toyota,sienna xle minivan 4d,good,6,...,5TDYZ3DC8LS062357,fwd,van,silver,https://images.craigslist.org/00a0a_hbBcOXHYbg...,Carvana is the safer way to buy a car During t...,dc,38.89,-77.03,2023-06-29 12:18:14
8,7314542698,https://cnj.craigslist.org/ctd/d/trenton-2017-...,central NJ,https://cnj.craigslist.org,50.503253,2017.0,volkswagen,e-golf sel premium,good,6,...,WVWPR7AU4HW950160,rwd,sedan,blue,https://images.craigslist.org/00H0H_3V3uEl06cB...,Carvana is the safer way to buy a car During t...,nj,40.22,-74.76,2024-04-03 12:34:10
9,7308204040,https://bham.craigslist.org/ctd/d/clanton-2019...,birmingham,https://bham.craigslist.org,121.088132,2019.0,toyota,tacoma,excellent,6,...,5TFCZ5AN0KX185798,4wd,hatchback,black,https://images.craigslist.org/00606_2RL4eJFKt2...,phone: ☎ (205) 862-8003 text: Text ...,al,32.9229,-86.545,2023-09-15 22:53:48


In [17]:
unique_cars = sample_df[['manufacturer', 'model']].drop_duplicates().reset_index(drop=True)
unique_cars['car_id'] = range(1, len(unique_cars) + 1)

sample_cars_table = sample_df[["id", "manufacturer", "model", "drive", "cylinders", "transmission", "paint_color"]]
sample_cars_table = sample_cars_table.merge(unique_cars, on=['manufacturer', 'model'], how='left').drop_duplicates().reset_index(drop=True)
sample_cars_table

Unnamed: 0,id,manufacturer,model,drive,cylinders,transmission,paint_color,car_id
0,7316737397,bmw,4 series 440i xdrive gran,fwd,6,automatic,grey,1
1,7315303863,mercedes-benz,s-class,rwd,4,automatic,black,2
2,7306160055,audi,q8 premium sport utility 4d,rwd,6,automatic,blue,3
3,7310276447,ford,f-150,4wd,8,automatic,white,4
4,7303685479,chevrolet,equinox,4wd,4,automatic,white,5
5,7311927824,ford,expedition max,4wd,6,automatic,black,6
6,7310325874,acura,mdx sh-awd w/technology,4wd,6,automatic,white,7
7,7316146250,toyota,sienna xle minivan 4d,fwd,6,automatic,silver,8
8,7314542698,volkswagen,e-golf sel premium,rwd,6,automatic,blue,9
9,7308204040,toyota,tacoma,4wd,6,automatic,black,10


In [18]:
colors = df["paint_color"].dropna().unique()
sample_colors_table = pd.DataFrame(colors, columns=["color"])

hex_dict = {
    'black': '#000000',
    'blue': '#0000FF',
    'brown': '#A52A2A',
    'green': '#008000',
    'grey': '#808080',
    'orange': '#FFA500',
    'purple': '#800080',
    'red': '#FF0000',
    'silver': '#C0C0C0',
    'white': '#FFFFFF',
    'yellow': '#FFFF00',
}

sample_colors_table['hex'] = colors_table['color'].map(hex_dict)

sample_colors_table

Unnamed: 0,color,hex
0,white,#FFFFFF
1,blue,#0000FF
2,red,#FF0000
3,black,#000000
4,silver,#C0C0C0
5,grey,#808080
6,brown,#A52A2A
7,green,#008000
8,yellow,#FFFF00
9,orange,#FFA500


In [19]:
fake = Faker()

num_users = 10

user_data = {
    'user_id': random.sample(range(10000000, 100000000), num_users), # 8 digit user_id
    'first_name': [fake.first_name() for _ in range(num_users)],
    'last_name': [fake.last_name() for _ in range(num_users)],
    'email': [fake.email() for _ in range(num_users)],
    'date_of_birth': [fake.date_of_birth(minimum_age=18, maximum_age=90) for _ in range(num_users)],
    'password': [fake.password(length=10, special_chars=True, digits=True, upper_case=True, lower_case=True) for _ in range(num_users)]
}

sample_users_table = pd.DataFrame(user_data)
sample_users_table

Unnamed: 0,user_id,first_name,last_name,email,date_of_birth,password
0,57354515,Alexander,Diaz,madisontaylor@example.org,1948-04-10,(0(5SWs^+A
1,35781206,Julia,Duarte,zgarcia@example.org,1958-05-28,T76I&Dfos&
2,85707167,Aaron,Ramirez,jeremy04@example.net,1960-09-02,^bYKcWp*_4
3,73461497,Leah,Jackson,sarahart@example.org,1992-06-07,(*!21XIfuW
4,37571958,Heather,Robertson,robertjohnson@example.net,2002-09-12,@+*yiCEn2Z
5,93596265,Bobby,Brown,amber07@example.org,1946-12-31,zW7GxF)m(M
6,43481083,Sarah,Chase,tamarapalmer@example.net,1957-12-06,fF!F0rwc5!
7,53734996,Daniel,Lewis,connie72@example.com,1961-03-26,O*o13NIaSo
8,74012686,Roberto,Cherry,austinchurch@example.net,1942-07-19,!2jKS%*lqG
9,57364420,Scott,Smith,weissphilip@example.com,1985-07-31,#Q92U7him2


In [20]:
num_rentals = 50

rental_data = {
    'rental_id': random.sample(range(1000000000, 10000000000), num_rentals), # 10 digit rental_id
    'car_id': [random.choice(sample_cars_table['car_id']) for _ in range(num_rentals)],
    'user_id': [random.choice(sample_users_table['user_id']) for _ in range(num_rentals)],
    'rent_date': [fake.date_this_year(before_today=True, after_today=False) for _ in range(num_rentals)],
}

num_active = num_rentals // 2
num_inactive = num_rentals - num_active

# active rentals
active_rent_dates = [fake.date_this_year(before_today=True, after_today=False) for _ in range(num_active)]
active_return_dates = [(pd.to_datetime(rent_date) + pd.to_timedelta(random.randint(1, 50), unit='d')).strftime('%Y-%m-%d')
                       for rent_date in active_rent_dates]

# inactive rentals
start_date = datetime.now() - timedelta(days=365 * 10)
inactive_rent_dates = [fake.date_between_dates(date_start=start_date, date_end=datetime.now() - timedelta(days=50))
                       for _ in range(num_inactive)]
inactive_return_dates = [(pd.to_datetime(rent_date) + pd.to_timedelta(random.randint(1, 50), unit='d')).strftime('%Y-%m-%d')
                         for rent_date in inactive_rent_dates]

rental_data['rent_date'] = active_rent_dates + inactive_rent_dates
rental_data['return_date'] = active_return_dates + inactive_return_dates
rental_data['status'] = ['active'] * num_active + ['inactive'] * num_inactive
sample_rentals_table = pd.DataFrame(rental_data)
sample_rentals_table = sample_rentals_table.sample(frac=1).reset_index(drop=True)

sample_rentals_table

Unnamed: 0,rental_id,car_id,user_id,rent_date,return_date,status
0,8792766032,20,57354515,2024-03-26,2024-04-25,active
1,5311311850,25,74012686,2024-05-26,2024-06-07,active
2,9749706953,2,57364420,2024-06-09,2024-07-16,active
3,2671282927,4,57354515,2024-02-06,2024-03-09,active
4,4317848457,48,43481083,2019-12-29,2020-02-13,inactive
5,6532143052,46,85707167,2020-01-20,2020-02-28,inactive
6,3999955219,29,57364420,2021-07-12,2021-08-26,inactive
7,5088241972,26,57364420,2019-12-11,2020-01-14,inactive
8,2561594081,38,85707167,2014-11-05,2014-11-28,inactive
9,5882296605,19,53734996,2019-08-13,2019-09-12,inactive


In [25]:
sample_listings_table = sample_df[["id", "url", "region", "image_url", "description", "condition", "vin", "lat", "long", "state", "price", "posting_date"]]
sample_listings_table["car_id"] = sample_cars_table['car_id']

sample_listings_table = sample_listings_table.rename(columns={
    "url": "listing_url",
    "lat": "lat_id",
    "long": "long_id",
    "state": "state_id"
})

sample_listings_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_listings_table["car_id"] = sample_cars_table['car_id']


Unnamed: 0,id,listing_url,region,image_url,description,condition,vin,lat_id,long_id,state_id,price,posting_date,car_id
0,7316737397,https://salina.craigslist.org/ctd/d/salina-201...,salina,https://images.craigslist.org/00606_eTQPhZGTkM...,Carvana is the safer way to buy a car During t...,good,WBA4E5C56HG189275,38.81,-97.61,ks,77.09791,2024-05-01 20:52:08,1
1,7315303863,https://nashville.craigslist.org/ctd/d/murfree...,nashville,https://images.craigslist.org/00n0n_ktxXjzsmY5...,2020 Mercedes-Benz S560 PREMIUM RWD W/NAV ...,excellent,WDDUG8DB7LA521485,35.840175,-86.387603,tn,239.218938,2024-06-05 05:16:46,2
2,7306160055,https://knoxville.craigslist.org/ctd/d/knoxvil...,knoxville,https://images.craigslist.org/00e0e_c50g4fIkpN...,Carvana is the safer way to buy a car During t...,good,WA1AVAF10KD039141,35.97,-83.94,tn,171.774888,2023-06-23 15:57:20,3
3,7310276447,https://bemidji.craigslist.org/ctd/d/perham-20...,bemidji,https://images.craigslist.org/00t0t_5Vkb0qv8r8...,"2017 Ford F-150 XLT -- $25,250 ☎ Call: (...",like new,1FTFX1EF7HKC12788,46.590185,-95.57469,mn,67.151508,2023-12-08 09:29:49,4
4,7303685479,https://eauclaire.craigslist.org/ctd/d/wiscons...,eau claire,https://images.craigslist.org/00H0H_2c6SA6BmDW...,2015 Chevrolet Equinox LS AWD 2.4 4 Cylinder ...,excellent,T435XZZHZ62NGMX3L,44.410164,-89.882412,wi,34.49327,2024-05-06 17:32:51,5
5,7311927824,https://olympic.craigslist.org/ctd/d/puyallup-...,olympic peninsula,https://images.craigslist.org/00T0T_1nXrsOqH1Q...,Joydrive: 253-201-0804 https://joydrive.com/...,good,1FMJK1JT2LEA38166,47.1991,-122.3151,wa,178.181541,2023-12-29 15:02:07,6
6,7310325874,https://porthuron.craigslist.org/ctd/d/point-e...,port huron,https://images.craigslist.org/00h0h_dGAvgLShy8...,Carvana is the safer way to buy a car During t...,good,5J8YD4H50KL008682,43.01717,-82.45099,mi,105.288246,2024-04-23 05:58:14,7
7,7316146250,https://washingtondc.craigslist.org/nva/ctd/d/...,"washington, DC",https://images.craigslist.org/00a0a_hbBcOXHYbg...,Carvana is the safer way to buy a car During t...,good,5TDYZ3DC8LS062357,38.89,-77.03,dc,97.309849,2023-06-29 12:18:14,8
8,7314542698,https://cnj.craigslist.org/ctd/d/trenton-2017-...,central NJ,https://images.craigslist.org/00H0H_3V3uEl06cB...,Carvana is the safer way to buy a car During t...,good,WVWPR7AU4HW950160,40.22,-74.76,nj,50.503253,2024-04-03 12:34:10,9
9,7308204040,https://bham.craigslist.org/ctd/d/clanton-2019...,birmingham,https://images.craigslist.org/00606_2RL4eJFKt2...,phone: ☎ (205) 862-8003 text: Text ...,excellent,5TFCZ5AN0KX185798,32.9229,-86.545,al,121.088132,2023-09-15 22:53:48,10


In [22]:
import pandas as pd

state_data = {
    'state_id': ['al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
                   'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma',
                   'mi', 'mn', 'ms', 'mo', 'mt', 'nc', 'ne', 'nv', 'nj', 'nm', 'ny',
                   'nh', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
                   'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy'],
    'state_name': ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'District of Columbia', 
                   'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 
                   'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 
                   'Montana', 'North Carolina', 'Nebraska', 'Nevada', 'New Jersey', 'New Mexico', 'New York', 'New Hampshire', 
                   'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 
                   'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'],
    'time_zone': ['Central', 'Alaska', 'Mountain', 'Central', 'Pacific', 'Mountain', 'Eastern', 'Eastern', 'Eastern', 'Eastern', 
                  'Eastern', 'Hawaii-Aleutian', 'Mountain', 'Central', 'Eastern', 'Central', 'Central', 'Eastern', 'Central', 
                  'Eastern', 'Eastern', 'Eastern', 'Eastern', 'Central', 'Central', 'Central', 'Mountain', 'Eastern', 'Central', 
                  'Pacific', 'Eastern', 'Mountain', 'Eastern', 'Eastern', 'Central', 'Eastern', 'Central', 'Pacific', 'Eastern', 
                  'Eastern', 'Eastern', 'Central', 'Central', 'Central', 'Mountain', 'Eastern', 'Eastern', 'Pacific', 'Eastern', 
                  'Central', 'Mountain']
}

sample_states_table = pd.DataFrame(state_data)
sample_states_table

Unnamed: 0,state_id,state_name,time_zone
0,al,Alabama,Central
1,ak,Alaska,Alaska
2,az,Arizona,Mountain
3,ar,Arkansas,Central
4,ca,California,Pacific
5,co,Colorado,Mountain
6,ct,Connecticut,Eastern
7,dc,District of Columbia,Eastern
8,de,Delaware,Eastern
9,fl,Florida,Eastern


In [23]:
num_favorites = 20
favorite_data = {
    'id': [random.choice(sample_cars_table['id']) for _ in range(num_favorites)],
    'user_id': [random.choice(sample_users_table['user_id']) for _ in range(num_favorites)]
}

sample_favorites_table = pd.DataFrame(favorite_data)
sample_favorites_table['car_id'] = sample_favorites_table['id'].map(sample_cars_table.set_index('id')['car_id'].to_dict())

sample_favorites_table

Unnamed: 0,id,user_id,car_id
0,7316737397,57354515,1
1,7316172343,37571958,13
2,7311254022,53734996,27
3,7313415077,73461497,19
4,7306160055,35781206,3
5,7311884969,53734996,47
6,7316521595,37571958,26
7,7311884969,37571958,47
8,7316737397,53734996,1
9,7308204040,73461497,10


In [24]:
# save da files
sample_cars_table.to_csv('sample_cars_table.csv', index=False)
sample_colors_table.to_csv('sample_colors_table.csv', index=False)
sample_users_table.to_csv('sample_users_table.csv', index=False)
sample_rentals_table.to_csv('sample_rentals_table.csv', index=False)
sample_listings_table.to_csv('sample_listings_table.csv', index=False)
sample_favorites_table.to_csv('sample_favorites_table.csv', index=False)
sample_states_table.to_csv('sample_states_table.csv', index=False)

### Data Summaries

In [28]:
lengths = []
for i in (df["url"] + df["image_url"]):
    lengths.append(len(str(i)))

max(lengths)

169

In [15]:
# lat and long
max_lat = max(df["lat"].unique())
min_lat = min(df["lat"].unique())

max_long = max(df["long"].unique())
min_long = min(df["long"].unique())

print("lat: [", min_lat, ",", max_lat, "] long: [", min_long, ",", max_long, "]")

lat: [ -84.122245 , 82.390818 ] long: [ -159.827728 , 173.885502 ]


In [19]:
df["state"].dropna().unique()

array(['al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga',
       'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma',
       'mi', 'mn', 'ms', 'mo', 'mt', 'nc', 'ne', 'nv', 'nj', 'nm', 'ny',
       'nh', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx',
       'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy'], dtype=object)

In [20]:
df["condition"].dropna().unique()

array(['good', 'excellent', 'fair', 'like new', 'new', 'salvage'],
      dtype=object)

In [23]:
user_favorites = favorites_table[favorites_table['userID'] == 'user_25764002']
user_favorites

Unnamed: 0,favoriteID,id,userID
2,ee48c532-74cb-441b-866e-bc80d93b2983,7305679700,user_25764002
62,8bc0a3ef-2b21-4166-a4f3-3c71340284a1,7315916207,user_25764002
432,70b71a2d-2668-4515-858b-43cb648357db,7316856596,user_25764002
519,f1b6b231-3410-4140-aad3-2087e9e2f1a9,7313105260,user_25764002


In [21]:
len(df['manufacturer'].unique())

42

In [19]:
df['model'].unique()

array(['sierra 1500 crew cab slt', 'silverado 1500',
       'silverado 1500 crew', ..., 'gand wagoneer', '96 Suburban',
       'Paige Glenbrook Touring'], dtype=object)

In [20]:
len(df['model'].unique())

29667