# US Restaurant data analysis

## Import/Load the Data

In [1]:
import pandas as pd

# Load the restaurant listing into the working DataFrame `df`.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('restaurant_data_analysis.csv')

## Checking the structure, contents, and statistics of the data

In [2]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(10000, 8)

In [3]:
# Column labels of the DataFrame, in their on-disk order.
df.columns

Index(['name', 'categories', 'city', 'state', 'address', 'postalCode',
       'latitude', 'longitude'],
      dtype='object')

In [4]:
# Print a concise summary of the DataFrame: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        10000 non-null  object 
 1   categories  10000 non-null  object 
 2   city        10000 non-null  object 
 3   state       10000 non-null  object 
 4   address     10000 non-null  object 
 5   postalCode  10000 non-null  object 
 6   latitude    10000 non-null  float64
 7   longitude   10000 non-null  float64
dtypes: float64(2), object(6)
memory usage: 625.1+ KB


In [5]:
# Preview the first rows (head() shows 5 by default) to sanity-check the parsed values.
df.head()

Unnamed: 0,name,categories,city,state,address,postalCode,latitude,longitude
0,Carl's Jr.,"Fast Food Restaurants,Hamburgers and Hot Dogs,...",Greeley,CO,2555 11th Avenue,80631,40.39629,-104.69699
1,Del Taco,"Restaurant,Mexican Restaurants,Fast Food Resta...",Grand Junction,CO,2513 Highway 6 And 50,81505,39.08135,-108.58689
2,Which Wich,"Sandwich Shops,Fast Food Restaurants,Restauran...",Grand Junction,CO,1125 Patterson Road,81506,39.09148,-108.55411
3,Chipotle Mexican Grill,"Fast Food Restaurants,Mexican Restaurants,Rest...",Aurora,CO,3455 N Salida Court,80011,39.76369,-104.77671
4,Taco Bell,"Fast Food Restaurants,Mexican Restaurants,Rest...",Denver,CO,5225 E Colfax Avenue,80220,39.74044,-104.92636


In [6]:
# Preview the last rows (tail() shows 5 by default) to confirm the file was read to the end.
df.tail()

Unnamed: 0,name,categories,city,state,address,postalCode,latitude,longitude
9995,Church's Chicken Of Birmingham,"American Restaurants,Caterers,Soul Food Restau...",Bessemer,AL,1803 Carolina Ave,35020,33.399574,-86.95105
9996,Luxury Box,"Mexican Restaurants,Caterers,Delicatessens,Iri...",Raleigh,NC,8511 Cantilever Way Ste 105,27613-6996,35.8845,-78.737541
9997,McDonald's,"Carry-out food,Hot dog restaurants,Fast Food R...",Plover,WI,2601 Plover Rd,54467,44.45318,-89.54317
9998,Pizza Hut,"Carry-out food,Caterers,Restaurant delivery se...",Hendersonville,TN,170 E Main St,37075,36.308262,-86.607339
9999,Burger King,"Fast Food Restaurant,Restaurants",Durham,NC,3400 Westgate Drive,27707-2696,35.965869,-78.963366


In [7]:
# Descriptive statistics for every column, not only the numeric ones (include='all'):
# object columns report count/unique/top/freq, numeric columns report
# count/mean/std/min/quartiles/max. Statistics that do not apply to a column show as NaN.
df.describe(include='all')

Unnamed: 0,name,categories,city,state,address,postalCode,latitude,longitude
count,10000,10000,10000,10000,10000,10000.0,10000.0,10000.0
unique,1545,5670,2810,47,9954,5310.0,,
top,McDonald's,"Fast Food Restaurants,Hamburgers and Hot Dogs,...",Columbus,OH,2005 Kalia Rd,37040.0,,
freq,765,182,105,922,4,17.0,,
mean,,,,,,,37.298215,-93.284547
std,,,,,,,5.566995,16.421424
min,,,,,,,19.63956,-159.46431
25%,,,,,,,33.850652,-98.342758
50%,,,,,,,37.490189,-86.677452
75%,,,,,,,40.808905,-81.803061


## Cleaning the data

In [8]:
# Remove duplicates
# A row is dropped only when it matches an earlier row in every column. This step
# sees the text exactly as loaded (the lowercase/strip pass comes later in the
# notebook), so rows differing only by case or padding are both kept.
df = df.drop_duplicates()

In [9]:
# Missing values: report how many nulls each column contains.
# This cell only inspects the data; nothing is modified here.
df.isna().sum()

name          0
categories    0
city          0
state         0
address       0
postalCode    0
latitude      0
longitude     0
dtype: int64

In [10]:
# Geospatial analysis needs both coordinates, so discard any row that lacks
# either a latitude or a longitude.
COORD_COLUMNS = ['latitude', 'longitude']
df = df.dropna(subset=COORD_COLUMNS)

In [11]:
# Standardize the free-text columns: lowercase first, then trim surrounding
# whitespace, so values that differ only by case or padding compare equal.
# `state` and `postalCode` are intentionally not touched by this step.
TEXT_COLUMNS = ['name', 'categories', 'city', 'address']
for text_col in TEXT_COLUMNS:
    df[text_col] = df[text_col].str.lower().str.strip()

In [12]:
# Handle outliers in latitude and longitude
# Keep only coordinates that can plausibly fall inside the 50 US states.
# The contiguous states span roughly 24..49 latitude and -125..-66 longitude, but
# filtering on that box alone discards valid rows from Hawaii (latitude ~19..22,
# longitude ~-160..-155) and Alaska (latitude ~51..71.5, longitude west of -130).
# This dataset does contain such rows: its minimum latitude is ~19.64 and its
# minimum longitude is ~-159.46. The wider box below keeps them.
# NOTE: this is a coarse sanity check, not a border test; the box also covers
# parts of Canada, Mexico and open ocean. Aleutian points with positive (east)
# longitudes are still excluded.
US_LAT_RANGE = (18.9, 71.5)     # southern tip of Hawaii .. northern tip of Alaska
US_LON_RANGE = (-180.0, -66.0)  # antimeridian .. eastern tip of Maine

# Series.between is inclusive on both ends (same as the >= / <= pair it replaces),
# and NaN coordinates evaluate to False, so they are filtered out as before.
in_us_bounds = (
    df['latitude'].between(*US_LAT_RANGE)
    & df['longitude'].between(*US_LON_RANGE)
)

# .copy() makes the result an independent frame, so later column assignments
# do not raise SettingWithCopyWarning.
df = df[in_us_bounds].copy()