# Question 2

Can we predict how many stars a business will have?

In [1]:
import pandas as pd
import ProcessData as p
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read in data   
business_file = 'yelp_dataset/yelp_academic_dataset_business.json'
business_df = pd.read_json(business_file, lines=True)

In [3]:
# Process categories and attributes columns
categories = ['Restaurants', 'Shopping', 'Nightlife', 'Active Life', 'Beauty & Spas', 'Automotive', 'Home Services']
business_df = p.process_business(business_df, categories)

# Drop the columns that were processed
business_df.drop(['attributes', 'categories'], axis='columns', inplace=True)

In [4]:
# Check for remaining missing values
business_df.isnull().mean().sort_values(ascending=False).head(10)

hours                     0.237559
latitude                  0.000032
longitude                 0.000032
Home Services             0.000000
BusinessParking_garage    0.000000
BestNights_monday         0.000000
BestNights_saturday       0.000000
BestNights_sunday         0.000000
BestNights_thursday       0.000000
BestNights_tuesday        0.000000
dtype: float64

In [5]:
business_df.drop(['hours'], axis='columns', inplace=True)

Drop the hours column for now. There are too many missing values and preprocessing would need to be done to make the column useable.

In [6]:
business_df.dropna(subset=['latitude', 'longitude'], axis='rows', inplace=True)

In [7]:
# Drop columns that are essentially unique identifiers
business_df.drop(['address', 'business_id', 'name'], axis='columns', inplace=True)

In [8]:
business_df.drop(['city', 'neighborhood', 'postal_code'], axis='columns', inplace=True)

In [9]:
business_df = pd.get_dummies(business_df, drop_first=True)

In [10]:
business_df.columns

Index(['is_open', 'latitude', 'longitude', 'review_count', 'stars',
       'AcceptsInsurance', 'Ambience_casual', 'Ambience_classy',
       'Ambience_divey', 'Ambience_hipster',
       ...
       'RestaurantsPriceRange2_1', 'RestaurantsPriceRange2_2',
       'RestaurantsPriceRange2_3', 'RestaurantsPriceRange2_4', 'Smoking_no',
       'Smoking_outdoor', 'Smoking_yes', 'WiFi_free', 'WiFi_no', 'WiFi_paid'],
      dtype='object', length=180)

## Linear Model

In [11]:
y = business_df.stars
business_df.drop(['stars'], axis='columns', inplace=True)
X = business_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

In [12]:
lm_model = LinearRegression(normalize=True)
lm_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [13]:
y_pred = lm_model.predict(X_train)
print(r2_score(y_train, y_pred))
print(mean_squared_error(y_train, y_pred))

0.10279718504958257
0.9303456135715552


In [14]:
y_pred = lm_model.predict(X_test)
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

-24813231.238857467
25473944.071368743


In [15]:
temp = list(zip(y_test, y_pred))
temp[:5]

[(4.0, 3.293384121270088),
 (4.0, 3.8658198012402787),
 (3.5, 3.713004050551749),
 (4.0, 3.45502106091394),
 (1.5, 3.6846348977843393)]