In [None]:
import pandas as pd
# Relative path to the training CSV; a later cell re-reads the same file through this name.
filepath = "./data/catA_train.csv"
# Load the raw training data and render it with the notebook's rich display.
df1 = pd.read_csv(filepath)
display(df1)

In [None]:
# Columns excluded from the analysis.
# df1 is re-bound instead of mutated in place, and errors="ignore" skips labels
# that are already gone, so re-running this cell does not raise KeyError.
# Trade-off: a misspelled label in this list is skipped silently as well.
df1 = df1.drop(
    columns=[
        'AccountID',
        'Company',
        'Industry',
        'Square Footage',
        'Employees (Single Site)',
        'Employees (Domestic Ultimate Total)',
        'Employees (Global Ultimate Total)',
        'Parent Company',
        'Parent Country',
    ],
    errors="ignore",
)
# Keep only rows that have both coordinates.
df2 = df1.dropna(subset=["LATITUDE", "LONGITUDE"])
# Count of remaining missing values per column.
print(df2.isna().sum())

In [None]:
# Summary statistics for df2 (the rows that have both LATITUDE and LONGITUDE).
df2.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# A notebook cell only renders its last bare expression, so a bare nunique()
# followed by unique() would show the categories but never the count.
# display() with both values renders the number of distinct categories and
# then the categories themselves.
display(
    df2['Import/Export Status'].nunique(),
    df2['Import/Export Status'].unique(),
)

In [None]:
# Keep active companies only. The explicit .copy() makes df3 an independent
# frame, so adding a column below neither triggers SettingWithCopyWarning nor
# risks writing through to df2.
df3 = df2[df2['Company Status (Active/Inactive)'] == 'Active'].copy()
# Flag rows whose import/export status is missing (read from df3 itself, so no
# index alignment against the unfiltered frame is needed).
df3['Import/Export Status_Missing'] = df3['Import/Export Status'].isna()
# One-hot encode the three categorical columns, one step per column.
df4 = pd.get_dummies(df3, columns=['Entity Type'], prefix='Entity_Type')
df5 = pd.get_dummies(df4, columns=['Ownership Type'], prefix='Ownership_Type')
df6 = pd.get_dummies(df5, columns=['Import/Export Status'], prefix='Import_Export_Status')

In [None]:
# Collapse rarely occurring SIC codes into a single 'Others' bucket.
threshold = 20  # codes seen fewer than this many times are treated as rare
sic_code_frequency = df6['SIC Code'].value_counts()
low_frequency_sic_codes = sic_code_frequency.index[sic_code_frequency < threshold]
df6['SIC Code'] = df6['SIC Code'].replace(low_frequency_sic_codes, 'Others')

# Frequencies after bucketing, printed as a sanity check.
sic_code_frequency1 = df6['SIC Code'].value_counts()
print(sic_code_frequency1)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the SIC codes (including the 'Others' bucket) into integer labels.
# Values are stringified first so the encoder sees one uniform type.
label_encoder = LabelEncoder()
df6['SIC Code'] = label_encoder.fit_transform(df6['SIC Code'].astype(str))
df6['SIC Code'].unique()

In [None]:
# Remove the detailed SIC columns and the ultimate-company name columns.
# Re-binding df6 with errors="ignore" keeps the cell re-runnable: labels that
# were already dropped are skipped instead of raising KeyError.
df6 = df6.drop(
    columns=[
        '8-Digit SIC Code',
        '8-Digit SIC Description',
        'Global Ultimate Company',
        'Domestic Ultimate Company',
    ],
    errors="ignore",
)
global_country_frequency = df6['Global Ultimate Country'].value_counts()
# NOTE(review): value_counts() never reports a count below 1, so with
# threshold = 1 the "< threshold" filter selects nothing and no country is
# grouped into 'Others'. Raise the threshold if grouping rare countries is intended.
threshold = 1
low_frequency_global_country = global_country_frequency[global_country_frequency < threshold].index
df6['Global Ultimate Country'] = df6['Global Ultimate Country'].replace(low_frequency_global_country, 'Others')
# Frequencies after the (currently no-op) bucketing step.
global_country_frequency1 = df6['Global Ultimate Country'].value_counts()
print(global_country_frequency1)

In [None]:
# astype(str) also turns missing countries into the literal string 'nan',
# which the encoder then treats as its own category.
df6['Global Ultimate Country'] = df6['Global Ultimate Country'].astype(str)
# Use a dedicated encoder for this column. Calling fit_transform on the shared
# `label_encoder` would refit it and discard the SIC Code mapping it holds,
# leaving no way to apply that mapping to new data later.
country_label_encoder = LabelEncoder()
df6['Global Ultimate Country'] = country_label_encoder.fit_transform(df6['Global Ultimate Country'])
df6["Global Ultimate Country"].unique()

In [None]:
# Drop the free-text description and the rows without a founding year.
# Building a new frame (rather than dropping in place) with errors='ignore'
# lets this cell be re-run without a KeyError on the already-removed column.
df6 = (
    df6
    .drop(columns=['Company Description'], errors='ignore')
    .dropna(subset=['Year Found'])
)
# Store the two "ultimate" flags as booleans (comparing an existing boolean
# with 1 leaves it unchanged, so repeating this step is harmless).
df6['Is Domestic Ultimate'] = df6['Is Domestic Ultimate'] == 1
df6['Is Global Ultimate'] = df6['Is Global Ultimate'] == 1
display(df6)

In [None]:
# Pairwise correlations between the float64/int64 columns of df6.
numerical_df = df6.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numerical_df.corr()
# sns.heatmap returns the matplotlib Axes it drew on; the name `correlation_df`
# is kept for compatibility even though the object is an Axes, not a DataFrame.
correlation_df = sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
correlation_df.set_title('Correlation Matrix of Numerical Features')
# Render the figure explicitly; display() on an Axes only prints its text repr.
plt.show()

In [None]:
from matplotlib.colors import LogNorm
# Geographic scatter of companies, coloured by domestic-ultimate sales on a
# logarithmic colour scale. `norm` is kept as a notebook-level name because the
# next plotting cell reuses it.
cmap = 'plasma'
norm = LogNorm()

fig, ax = plt.subplots()
scatter = ax.scatter(
    df6['LONGITUDE'],
    df6['LATITUDE'],
    s=5,
    c=df6['Sales (Domestic Ultimate Total USD)'],
    norm=norm,
    cmap=cmap,
)
cbar = fig.colorbar(scatter, ax=ax, label='Sales')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Scatter Plot with Colors Based on Sales (LogNorm)')
plt.show()

In [None]:
from matplotlib.colors import ListedColormap

constant_color = 'purple'  # not used by this plot; left defined in case another cell reads it

# The SIC codes are LabelEncoder output (non-negative integers), so code k can
# own colour slot k of a discrete colormap. Passing the codes through a colormap
# (instead of handing scatter ready-made RGB tuples) gives the colorbar real
# data to describe; with explicit RGB colours the colorbar is unrelated to the points.
n_sic_slots = int(df6['SIC Code'].max()) + 1
# 'husl' produces as many evenly spaced hues as requested, whereas the 9-colour
# 'Set1' palette cycles when asked for more, making different codes share a colour.
sic_palette = sns.color_palette('husl', n_colors=n_sic_slots)
sic_colors = {code: sic_palette[code] for code in df6['SIC Code'].unique()}

fig, ax = plt.subplots()
scatter = ax.scatter(
    df6['LONGITUDE'],
    df6['LATITUDE'],
    c=df6['SIC Code'],
    cmap=ListedColormap(sic_palette),
    # Half-integer limits centre every integer code on its own colour slot.
    vmin=-0.5,
    vmax=n_sic_slots - 0.5,
    s=3,
    # Per-point opacity from the LogNorm fitted on sales in the previous plotting cell.
    alpha=norm(df6['Sales (Domestic Ultimate Total USD)']),
)
cbar = fig.colorbar(scatter, ax=ax, label='SIC Code')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Scatter Plot with Varying Color by SIC Code and Varying Alpha Based on Sales (LogNorm)')
plt.show()

In [None]:
# Discard the status/fiscal-year columns, every Entity_Type / Ownership_Type
# dummy, the "both imports & exports" dummy and the missing-status flag.
df6.drop(
    columns=[
        'Company Status (Active/Inactive)',
        'Fiscal Year End',
        'Entity_Type_Branch',
        'Entity_Type_Independent',
        'Entity_Type_Parent',
        'Entity_Type_Subsidiary',
        'Ownership_Type_Non-Corporates',
        'Ownership_Type_Nonprofit',
        'Ownership_Type_Partnership',
        'Ownership_Type_Private',
        'Ownership_Type_Public',
        'Ownership_Type_Public Sector',
        'Import_Export_Status_Both Imports & Exports',
        'Import/Export Status_Missing',
    ],
    inplace=True,
)

# Re-express the two "ultimate" flags as 0/1 integer columns under new names,
# then remove the source columns.
df6['Domestic Ultimate'] = (df6['Is Domestic Ultimate'] == 1).astype(int)
df6['Global Ultimate'] = (df6['Is Global Ultimate'] == 1).astype(int)
df6.drop(columns=['Is Domestic Ultimate', 'Is Global Ultimate'], inplace=True)

# Same treatment for the remaining import/export dummies.
df6['Exports'] = df6['Import_Export_Status_Exports'].astype(int)
df6['Imports'] = df6['Import_Export_Status_Imports'].astype(int)
df6.drop(columns=['Import_Export_Status_Exports', 'Import_Export_Status_Imports'], inplace=True)

display(df6)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
# Target: domestic-ultimate sales. Features: every other column left in df6.
y = df6['Sales (Domestic Ultimate Total USD)']
X = df6.drop(columns='Sales (Domestic Ultimate Total USD)')

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Unfitted estimator used by the cross-validation cell.
model = GradientBoostingRegressor(random_state=42)

In [None]:
model_10 = GradientBoostingRegressor(random_state=42)
# 10-fold CV on the training split; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Cross-validate the very estimator that is fitted below, so the reported
# scores always describe model_10 (and the cell no longer depends on a `model`
# object created in another cell). cross_val_score fits clones of the
# estimator, so model_10 itself remains unfitted until the explicit fit().
cv_scores = cross_val_score(model_10, X_train, y_train, cv=kf, scoring='r2')

print("Cross-Validation R-squared scores:", cv_scores)

print("Mean R-squared score:", cv_scores.mean())
print("Standard Deviation of R-squared scores:", cv_scores.std())

# Fit on the full training split and report R-squared on the held-out test split.
model_10.fit(X_train, y_train)

test_score = model_10.score(X_test, y_test)
print("Test R-squared score:", test_score)

In [None]:
import joblib
# Persist the fitted model_10 with joblib.
# NOTE(review): despite the '.h5' extension this is a joblib file, not HDF5; the
# name is left as is because test_model() loads './base_model.h5'.
joblib.dump(model_10, 'base_model.h5')

In [None]:
def test_model(data, model_path='./base_model.h5'):
    """Load a persisted model from disk and return its predictions for ``data``.

    Args:
        data: Feature rows in the form accepted by the model's ``predict``
            (the notebook passes a one-row slice of ``X_test``).
        model_path: Location of the joblib file to load. Defaults to the file
            written by the notebook's ``joblib.dump`` cell.

    Returns:
        Whatever ``predict`` returns for ``data``.

    Note:
        ``joblib.load`` unpickles the file, which can execute arbitrary code;
        only load model files from a trusted source.
    """
    loaded_model = joblib.load(model_path)
    return loaded_model.predict(data)

# Smoke test of the saved model on the final row of the held-out set.
# Position -1 is the last row (position 1 would be the second row); the list
# indexer keeps the result a one-row DataFrame, as predict() needs 2-D input.
last_row = X_test.iloc[[-1]]

print(test_model(last_row))

In [None]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''

    result = [] 
    return result

In [None]:
# Simulate the hidden-data evaluation: re-read the training file, strip the
# target column, and pass the remaining columns to the submission entry point.
test_df = pd.read_csv(filepath).drop(columns=['Sales (Domestic Ultimate Total USD)'])
print(testing_hidden_data(test_df))