In [None]:
#Mohsin Essani
#Company_Data
#Random Forest

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the data
# Reads the company dataset into the `data` DataFrame. The path is relative,
# so the CSV has to be in the notebook's working directory.
data = pd.read_csv("company_data.csv")



In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Preview the first five rows (the default for head()).
data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments describe() reports on the numeric columns of a mixed frame.
data.describe()

In [None]:
# Print column names, non-null counts, dtypes and memory usage.
data.info()

In [None]:

# Convert Sales variable into categorical variable
# Values up to and including 7.5 become 'Low', anything above becomes 'High'
# (pd.cut intervals are closed on the right by default).
# Bin only while Sales is still numeric: on a second run of this cell the
# column already holds the Low/High labels, and passing those back into
# pd.cut fails instead of leaving the data unchanged.
if pd.api.types.is_numeric_dtype(data['Sales']):
    data['Sales'] = pd.cut(data['Sales'], bins=[-np.inf, 7.5, np.inf], labels=['Low', 'High'])

# Scale the numerical variables
# Standardize each numeric predictor to zero mean and unit variance, in place.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split, so test rows contribute to the mean/std. Tree ensembles are not
# sensitive to per-feature rescaling, so this should not change the forest,
# but fit the scaler on the training rows only if a scale-sensitive model is
# ever used here.
scaler = StandardScaler()
num_vars = ['CompPrice', 'Income', 'Advertising', 'Population', 'Price', 'Age', 'Education']
data[num_vars] = scaler.fit_transform(data[num_vars])



In [None]:
# One-hot encode the categorical variables
cat_vars = ['ShelveLoc', 'Urban', 'US']
# Encode only the columns that are still present. Once `data` has been
# encoded the original columns are gone, so re-running this cell would
# otherwise raise a KeyError.
pending_cat_vars = [col for col in cat_vars if col in data.columns]
if pending_cat_vars:
    data = pd.get_dummies(data, columns=pending_cat_vars)

# Split the data into training and testing sets
# X holds every column except the target; y is the Low/High Sales label.
X = data.drop('Sales', axis=1)
y = data['Sales']
# 70/30 hold-out split with a fixed seed. Stratifying on y keeps the Low/High
# ratio the same in both parts, in line with the stratified folds that
# cross_val_score uses for classifiers.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)



In [None]:
# Fit a 100-tree random forest on the training split; the fixed seed makes
# the fitted forest repeatable. fit() returns the estimator itself, so the
# construction and fit can be chained.
rfc = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the Low/High label for every row of the held-out test split.
y_pred = rfc.predict(X_test)



In [None]:
# Share of test rows whose predicted label matches the true label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))



In [None]:
# 5-fold cross-validation over the full feature matrix and target.
# `scores` holds one accuracy value per fold.
scores = cross_val_score(estimator=rfc, X=X, y=y, cv=5)
print("Cross validation scores: {}".format(scores))

# Mean fold accuracy, shown as a percentage.
mean_score_pct = scores.mean() * 100
print("Average cross validation score: {:.2f}%".format(mean_score_pct))


In [None]:

# Pair each importance from the fitted forest with its column name, keep the
# ten largest, and draw them as a horizontal bar chart.
feature_importances = pd.Series(data=rfc.feature_importances_, index=X.columns)
top_features = feature_importances.nlargest(n=10)

ax = top_features.plot.barh()
ax.set_title("Feature Importances")
plt.show()


In [None]:

# From the top-ten list, keep the features whose importance exceeds 0.05
# and print their names.
is_influential = top_features > 0.05
high_sales_features = top_features[is_influential].index
summary_template = "The most important variables for high sales are: {}"
print(summary_template.format(list(high_sales_features)))


# Based on the analysis, Price, Age, CompPrice, Population, Income, Advertising,
# ShelveLoc_Good, and Education are the variables that most influence whether
# sales of the product are high. Note that feature importance measures how much
# a variable helps the model separate low from high sales; it does not show the
# direction of the effect, so the plots in the cells below are needed to see
# which way each variable moves sales. Those plots suggest some linear
# relationships between these variables and sales. The company could therefore
# use this information to tune its marketing and pricing strategies, with the
# aim of increasing sales and maximizing profits.

In [None]:

# Explore the relationship between Income and Sales.
# `Sales` holds the binned Low/High label at this point, so a scatter plot
# against it collapses into two horizontal rows of dots. A box plot per
# Sales class shows how the Income distribution shifts between the classes.
sns.boxplot(x='Sales', y='Income', data=data)
# Income was standardized during preprocessing, so the axis is in z-score units.
plt.ylabel("Income (standardized)")
plt.title("Income vs. Sales")
plt.show()

# NOTE(review): the earlier reading of this relationship was that income
# grows as sales grow; confirm it against the per-class medians in the plot.

In [None]:
# Explore the relationship between Price and Sales.
# `Sales` holds the binned Low/High label at this point, so a scatter plot
# against it collapses into two horizontal rows of dots. A box plot per
# Sales class shows how the Price distribution shifts between the classes.
sns.boxplot(x='Sales', y='Price', data=data)
# Price was standardized during preprocessing, so the axis is in z-score units.
plt.ylabel("Price (standardized)")
plt.title("Price vs. Sales")
plt.show()

# NOTE(review): the earlier reading of this relationship was that price
# increases together with sales; confirm it against the per-class medians
# in the plot before relying on it.

In [None]:
# Create bar charts to compare Sales across groups of the important variables.
# `Sales` is a Low/High label here, so it has no mean to plot. Average a 0/1
# indicator of the High class instead: the bar height is then the share of
# High-sales rows in each group (seaborn's barplot plots the mean by default).
high_sales_flag = (data['Sales'] == 'High').astype(int)
sns.barplot(x=data['ShelveLoc_Good'], y=high_sales_flag)
plt.ylabel("Share of High sales")
plt.title("Share of High Sales by ShelveLoc_Good")
plt.show()



In [None]:

# Compare Sales between US and non-US rows.
# `Sales` is a Low/High label here, so it has no mean to plot. Average a 0/1
# indicator of the High class instead: the bar height is then the share of
# High-sales rows in each group (seaborn's barplot plots the mean by default).
high_sales_flag = (data['Sales'] == 'High').astype(int)
sns.barplot(x=data['US_Yes'], y=high_sales_flag)
plt.ylabel("Share of High sales")
plt.title("Share of High Sales by US_Yes")
plt.show()

# NOTE(review): the earlier reading was that the US_Yes group has the higher
# average; confirm it against the two bars above.

In [None]:
# Recast the true and predicted labels as booleans ("is this row High?") and
# print precision / recall / F1 for the resulting False/True classes.
y_test_high = y_test == 'High'
y_pred_high = y_pred == 'High'

print("Classification Report for High Sales:")
high_sales_report = classification_report(y_test_high, y_pred_high)
print(high_sales_report)