**Austin data set cleaning and merging:**
1. merged austin_intakes and austin_outcomes by Animal ID, and discarded any rows that don't have matching Animal IDs -> austin (merged df)

2. remove "*" in the Name_x and Name_y entries

3. finding mismatches between col_x and col_y of austin
- MonthYear mismatches (the two sides use different formats) + inspecting DateTime: both record the time of intake (_x) or outcome (_y)
- deleting duplicate columns that have no mismatches (Name_y, AnimalType_y, Breed_y, Color_y)
- renamed x columns to drop the "_x" suffix

4. checking and dropping duplicate rows (110 rows)

5. removing unwanted attributes (Outcome Subtype)

About rows with missing values...
- rows with missing values removed from visualization data set
- need further investigation for ML dataset (accuracy, ROC AUC score)

In [25]:
import pandas as pd
import matplotlib.pyplot as plt


#raw intake and outcome tables are read straight from data.world
#(local copies: "Austin Animal Center Intakes.csv" / "Austin Animal Center Outcomes.csv")
austin_intakes = pd.read_csv('https://query.data.world/s/untanflczmb6zmswifydyqtg4mf6db?dws=00000')
austin_outcomes = pd.read_csv('https://query.data.world/s/jetxuufbm75thutkjc3ryrtmzaau36?dws=00000')

#size of each raw table before merging
for raw_table in (austin_intakes, austin_outcomes):
  print(raw_table.shape)

#inner join on Animal ID: IDs found in only one table are discarded; columns that
#exist in both tables get the default "_x" (intakes) / "_y" (outcomes) suffixes
austin = austin_intakes.merge(austin_outcomes, how="inner", on="Animal ID")
print(austin.head())
#NOTE(review): an ID that occurs several times in both tables yields one row per
#intake/outcome pairing, so the merged row count is not simply the number of matched IDs
print(austin.shape)

(167243, 12)
(167207, 12)
  Animal ID        Name_x              DateTime_x   MonthYear_x  \
0   A786884        *Brock  01/03/2019 04:19:00 PM  January 2019   
1   A706918         Belle  07/05/2015 12:59:00 PM     July 2015   
2   A724273       Runster  04/14/2016 06:43:00 PM    April 2016   
3   A665644           NaN  10/21/2013 07:59:00 AM  October 2013   
4   A857105  Johnny Ringo  05/12/2022 12:23:00 AM      May 2022   

                        Found Location    Intake Type Intake Condition  \
0  2501 Magin Meadow Dr in Austin (TX)          Stray           Normal   
1     9409 Bluegrass Dr in Austin (TX)          Stray           Normal   
2   2818 Palomino Trail in Austin (TX)          Stray           Normal   
3                          Austin (TX)          Stray             Sick   
4   4404 Sarasota Drive in Austin (TX)  Public Assist           Normal   

  Animal Type_x Sex upon Intake Age upon Intake  ...              DateTime_y  \
0           Dog   Neutered Male         2 year

In [None]:
#mount Google Drive inside the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#strip the "*" marker from both name columns ("*" is matched literally, not as a regex)
for name_col in ('Name_x', 'Name_y'):
  austin[name_col] = austin[name_col].str.replace('*', '', regex=False)

In [None]:
# print(austin_intakes.columns)
# print(austin_outcomes.columns)
# print(austin.columns)

#function that lists the rows where two columns disagree
def find_mismatches(df, col_x, col_y):
  """Return the [col_x, col_y] pairs of rows whose two values differ.

  Rows where either value is missing are left out, because NaN never
  compares equal and would otherwise always count as a mismatch.
  """
  left, right = df[col_x], df[col_y]
  both_present = left.notna() & right.notna()
  differing = left.ne(right)
  return df.loc[differing & both_present, [col_x, col_y]]

#print the disagreeing pairs for every attribute that exists on both sides of the merge
for shared_attr in ("Name", "MonthYear", "Animal Type", "Breed", "Color"):
  #MonthYear uses a different format on each side, so it is re-checked below
  print(find_mismatches(austin, shared_attr + "_x", shared_attr + "_y"))

#MonthYear follow-up: compare only the first three letters of the month
#(case-insensitive) and the last four characters (the year)
intake_monthyear = austin['MonthYear_x']
outcome_monthyear = austin['MonthYear_y']
month_differs = intake_monthyear.str[:3].str.lower() != outcome_monthyear.str[:3].str.lower()
year_differs = intake_monthyear.str[-4:] != outcome_monthyear.str[-4:]
mismatches_custom = austin[month_differs | year_differs]
print(mismatches_custom[['MonthYear_x', 'MonthYear_y']])

#MonthYear and DateTime describe two different events, so both sides are kept
#under explicit intake/outcome names
side_specific_names = {
    'DateTime_x': 'DateTime_intake',
    'DateTime_y': 'DateTime_outcome',
    'MonthYear_x': 'MonthYear_intake',
    'MonthYear_y': 'MonthYear_outcome',
}
austin = austin.rename(columns=side_specific_names)

#the columns still carrying "_y" (name, animal type, breed, color) repeat their "_x" twin
redundant_columns = austin.filter(like='_y').columns
austin = austin.drop(columns=redundant_columns)

#remove the "_x" marker from the surviving column names
austin.columns = austin.columns.str.replace("_x", "")
print(austin.columns)

In [None]:
#report how many rows are exact repeats of an earlier row, then remove them
duplicate_row_count = austin.duplicated().sum()
print(duplicate_row_count)
austin = austin.drop_duplicates()

In [None]:
#Outcome Subtype is not wanted in the cleaned data set
austin = austin.drop(columns='Outcome Subtype')

In [None]:
#before removing missing values, write file for ML
#(rows with missing values are still present here; the row index is not written)
austin.to_csv("/content/austin_ML.csv", index=False)

In [None]:
#number of missing entries in each column
missing_values = austin.isna().sum(axis=0)
print(missing_values)

#visualization copy: keep only the rows without any missing value
austin_cleaned = austin.dropna(how='any')

In [None]:
#write cleaned file for data visualization
#(austin_cleaned holds only rows without missing values; the row index is not written)
austin_cleaned.to_csv("/content/austin_VIS.csv", index=False)

All data set merging and cleaning:
1. attributes not in all data sets removed

2. combined similar attributes together
  - Sex upon outcome chosen over Sex upon intake to merge with Sex (higher feature importance score, higher correlation with Outcome Type)
  - Age upon outcome chosen over Age upon intake to merge with animalage (consistency with Sex upon outcome)

About rows with missing values...
- rows with missing values removed from visualization data set
- need further investigation for ML dataset (accuracy, ROC AUC score)

In [None]:
#columns that remain in the visualization data set
austin_cleaned.columns

In [None]:
#distinct outcome labels present in the visualization data set
austin_cleaned['Outcome Type'].unique()

In [None]:
# Number of animals for every (Animal Type, Outcome Type) combination
outcomes = (
    austin_cleaned
    .groupby(['Animal Type', 'Outcome Type'])
    .size()
    .reset_index(name='N')
)

# One row per animal type, one column per outcome; combinations that never occur count as 0
outcomes_pivot = outcomes.pivot(index='Animal Type', columns='Outcome Type', values='N').fillna(0)
# Divide every row by its own total so each animal type sums to 1
row_totals = outcomes_pivot.sum(axis=1)
outcomes_pivot = outcomes_pivot.div(row_totals, axis=0)

# One fill colour per outcome column, in column order
outcome_palette = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
    '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080'
]
outcomes_pivot.plot(kind='bar', stacked=True, color=outcome_palette, edgecolor='black')

plt.title('Outcomes')
plt.xlabel('Animal')
plt.ylabel('Proportion')
# Legend sits outside the axes, to the right of the bars
plt.legend(title='OutcomeType', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
#opening all data sets
california = pd.read_csv("california-animal-data.csv")
indiana = pd.read_csv("indiana-animal-data.csv")
austin = pd.read_csv("austin_VIS.csv")

#tag every row with the data set it came from
#NOTE(review): "Austin" is a city, yet it is stored under "State"; the value is left
#untouched because later steps may filter on it
austin["State"] = "Austin"
california["State"] = "California"
indiana["State"] = "Indiana"

#stack the three data sets; a column missing from a data set is filled with NaN for its rows
optipaw = pd.concat([austin, california, indiana], ignore_index=True)

#irrelevant columns and columns that are not available in all data sets
#(each label is listed once; the earlier list repeated 'outcome_is_dead')
columns_to_drop = [
    'sheltercode',
    'identichipnumber',
    'Outcome Subtype',
    'Intake Subtype',
    'intake_is_dead',
    'outcome_is_dead',
    'was_outcome_alive',
    'istransfer',
    'istrial',
    'isdoa',
    'Intake Condition',
    'latitude',
    'longitude',
    'Crossing',
    'Jurisdiction',
    'Secondary Color',
    'geopoint',
    'MonthYear_intake',
    'MonthYear_outcome',
    'deceaseddate',
    'deceasedreason',
    'diedoffshelter',
    'puttosleep',
    'returndate',
    'returnedreason',
    'Sex upon Intake',
    'Age upon Intake',
]
#drop() raises KeyError if any of these labels is absent from the combined frame
optipaw = optipaw.drop(columns=columns_to_drop)

#ensuring no duplicate rows, remove if found
print(optipaw.duplicated().sum())
optipaw = optipaw.drop_duplicates()

#attributes of 3 data sets (each now includes the added "State" column)
print(california.columns)
print(indiana.columns)
print(austin.columns)

In [None]:
#Each step below merges columns that hold the same attribute under different names:
#the first column listed wins, later ones only fill its gaps, then the extras are dropped.

#"Animal ID" <- "Animal ID", "id"
optipaw['Animal ID'] = optipaw['Animal ID'].fillna(optipaw['id'])
optipaw = optipaw.drop(columns=['id'])

#"Name" <- "Name", "Animal Name", "animalname"
optipaw['Name'] = (
    optipaw['Name']
    .fillna(optipaw['Animal Name'])
    .fillna(optipaw['animalname'])
)
optipaw = optipaw.drop(columns=['Animal Name', 'animalname'])

#"Animal Type" <- "Animal Type", "speciesname"
optipaw['Animal Type'] = optipaw['Animal Type'].fillna(optipaw['speciesname'])
optipaw = optipaw.drop(columns=['speciesname'])

#"DOB" <- "DOB", "Date of Birth"
optipaw['DOB'] = optipaw['DOB'].fillna(optipaw['Date of Birth'])
optipaw = optipaw.drop(columns=['Date of Birth'])

#'Color' <- 'Color', 'basecolour', 'Primary Color'
optipaw['Color'] = (
    optipaw['Color']
    .fillna(optipaw['basecolour'])
    .fillna(optipaw['Primary Color'])
)
optipaw = optipaw.drop(columns=['Primary Color', 'basecolour'])

#'Sex' <- 'Sex upon Outcome', 'Sex', 'sexname'
#('Sex upon Outcome' goes first: higher importance and correlation score with 'Outcome Type')
optipaw.loc[:, 'Sex'] = (
    optipaw['Sex upon Outcome']
    .fillna(optipaw['Sex'])
    .fillna(optipaw['sexname'])
)
optipaw = optipaw.drop(columns=['Sex upon Outcome', 'sexname'])

#'Age' <- 'Age upon Outcome', 'animalage'
#(chosen for consistency with 'Sex upon Outcome', although the importance and
#correlation scores are contradictory)
optipaw.loc[:, 'Age'] = optipaw['Age upon Outcome'].fillna(optipaw['animalage'])
optipaw = optipaw.drop(columns=['Age upon Outcome', 'animalage'])

#'Breed' <- 'Breed', 'breedname'
optipaw['Breed'] = optipaw['Breed'].fillna(optipaw['breedname'])
optipaw = optipaw.drop(columns=['breedname'])

#"Intake Reason" <- "Intake Type", "Reason for Intake", "intakereason"
optipaw['Intake Reason'] = (
    optipaw['Intake Type']
    .fillna(optipaw['Reason for Intake'])
    .fillna(optipaw['intakereason'])
)
optipaw = optipaw.drop(columns=['Intake Type', 'Reason for Intake', 'intakereason'])

#timestamp layout of the Austin DateTime columns, e.g. "01/03/2019 04:19:00 PM"
AUSTIN_DATETIME_FORMAT = '%m/%d/%Y %I:%M:%S %p'

def _to_iso_date(column, primary_format=AUSTIN_DATETIME_FORMAT):
  """Return `column` as 'YYYY-MM-DD' strings (NaN where no date could be read).

  Values are parsed with `primary_format` first. Text values that do not match
  it are parsed a second time with pandas' own format detection instead of
  being turned into missing values straight away. Non-text values (NaN,
  numbers) are never re-parsed, so they stay missing as before.
  """
  parsed = pd.to_datetime(column, format=primary_format, errors='coerce')
  is_text = column.map(lambda value: isinstance(value, str)).astype(bool)
  retry = parsed.isna() & is_text
  if retry.any():
    #utc=True keeps the result a single datetime dtype even if some strings carry
    #an offset; tz_localize(None) then drops the zone again
    fallback = pd.to_datetime(column[retry], errors='coerce', utc=True)
    parsed.loc[retry] = fallback.dt.tz_localize(None).to_numpy()
  return parsed.dt.strftime('%Y-%m-%d')

#change all to YYYY-MM-DD format and combining 'DateTime_intake', 'Intake Date', 'intakedate' into 'Intake Date'
#NOTE(review): only the Austin layout is known from this notebook; the Indiana cell
#parses 'intakedate' without a format, so the other columns may use another layout
for date_col in ('DateTime_intake', 'Intake Date', 'intakedate'):
  optipaw[date_col] = _to_iso_date(optipaw[date_col])
optipaw['Intake Date'] = optipaw['DateTime_intake'].fillna(optipaw['Intake Date']).fillna(optipaw['intakedate'])
optipaw = optipaw.drop(['DateTime_intake', 'intakedate'], axis=1)

#change all to YYYY-MM-DD format and combining 'DateTime_outcome', 'Outcome Date', 'movementdate' into 'Outcome Date'
for date_col in ('DateTime_outcome', 'Outcome Date', 'movementdate'):
  optipaw[date_col] = _to_iso_date(optipaw[date_col])
optipaw['Outcome Date'] = optipaw['DateTime_outcome'].fillna(optipaw['Outcome Date']).fillna(optipaw['movementdate'])
optipaw = optipaw.drop(['DateTime_outcome', 'movementdate'], axis=1)

#'Outcome Type' keeps its own values; 'movementtype' only fills the gaps
outcome_type = optipaw['Outcome Type'].fillna(optipaw['movementtype'])
optipaw['Outcome Type'] = outcome_type
optipaw = optipaw.drop(columns=['movementtype'])

#'Found Location' keeps its own values; 'location' only fills the gaps
found_location = optipaw['Found Location'].fillna(optipaw['location'])
optipaw['Found Location'] = found_location
optipaw = optipaw.drop(columns=['location'])

#columns of the integrated data set after all merges
optipaw.columns


In [None]:
#give the free-text columns one common capitalisation (Title Case) across the data sets
for text_col in ('Name', 'Animal Type', 'Breed', 'Color', 'Outcome Type', 'Sex', 'Intake Reason'):
  optipaw[text_col] = optipaw[text_col].str.title()

In [None]:
#before dropping missing values, write file for ML
#(rows with missing values are still present here; the row index is not written)
optipaw.to_csv("/content/optipaw_ML.csv", index=False)

In [None]:
# Missing entries per column, as raw counts
missing_values = optipaw.isna().sum()
print(missing_values)

# The same counts as a share of all rows
total_rows = len(optipaw)
missing_values_proportion = missing_values / total_rows
print(missing_values_proportion)

print(optipaw.shape)


import matplotlib.pyplot as plt
from upsetplot import UpSet, from_indicators

# True wherever a cell is missing; from here on missing_values is this boolean frame
missing_values = optipaw.isna()

# UpSet chart built from those per-column missing indicators
upset_data = from_indicators(missing_values)
UpSet(upset_data).plot()
plt.show()


# Visualization copy: only complete rows, and no animals whose sex is 'Unknown'
optipaw_cleaned = optipaw.dropna()
sex_is_known = optipaw_cleaned['Sex'] != 'Unknown'
optipaw_cleaned = optipaw_cleaned[sex_is_known]

optipaw_cleaned.to_csv("/content/optipaw_VIS.csv", index=False)

# Rows left for visualization; compare with the shape printed above
print(optipaw_cleaned.shape)

In [None]:
#count missing values per column in "optipaw.csv"
#NOTE(review): this notebook never writes "optipaw.csv"; presumably an earlier export
#kept for comparison - confirm where the file comes from
optipaw_mistake = pd.read_csv("optipaw.csv")
missing_values_mistakes = optipaw_mistake.isna().sum()
print(missing_values_mistakes)

In [None]:
# import pandas as pd
# import numpy as np
# import re
# from sklearn.impute import SimpleImputer


# missing_val_dataset = pd.read_csv('austin_VIS.csv')
# val_dataset = pd.read_csv('austin_ML.csv')

# # display(missing_val_dataset.head())
# # display(val_dataset.head())


# # EDA for machine learning


# def preprocess(df):
#     # Animal ID: 1 if the value is a string, else 0
#     df['Animal ID'] = df['Animal ID'].apply(lambda x: 1 if isinstance(x, str) else 0)

#     # 1 if it is a valid string, else make it 0
#     df['Name'] = df['Name'].apply(lambda x: 1 if isinstance(x, str) else 0)

#     # Fix DateTime_Intake
#     def classify_datetime(value):
#         if isinstance(value, str):
#             if re.match(r'^\d{5,}\.\d+$', value):
#                 return 1
#             elif re.match(r'^\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2} (AM|PM)$', value):
#                 return 2
#         return 0

#     df['DateTime_intake'] = df['DateTime_intake'].apply(classify_datetime)

#     # Fix MonthYear_intake


#     df['MonthYear_intake'] = df['MonthYear_intake'].apply(lambda x: 1 if isinstance(x, (int, float)) and not pd.isna(x) else 0)

#     # Found Location 1 if string, else 0
#     df['Found Location'] = df['Found Location'].apply(lambda x: 1 if isinstance(x, str) else 0)

#     # Intake Type mapping
#     intake_type_map = {
#         'Stray': 1,
#         'Public Assist': 2,
#         'Owner Surrender': 3,
#         'Abandoned': 4,
#         'Wildlife': 5,
#         'Euthanasia Request': 6
#     }

#     df['Intake Type'] = df['Intake Type'].map(intake_type_map).fillna(0).astype(int)

#     # Intake Condition mapping
#     intake_condition_map = {
#         'Normal': 1,
#         'Sick': 2,
#         'Injured': 3,
#         'Pregnant': 4,
#         'Neonatal': 5,
#         'Nursing': 6,
#         'Aged': 7,
#         'Medical': 8,
#         'Unknown': 9,
#         'Med Attn': 10,
#         'Other': 11,
#         'Behavior': 12,
#         'Feral': 13,
#         'Med Urgent': 14,
#         'Parvo': 15,
#         'Space': 16,
#         'Agonal': 17,
#         'Neurologic': 18,
#         'Panleuk': 19,
#         'Congenital': 20
#     }

#     df['Intake Condition'] = df['Intake Condition'].map(intake_condition_map).fillna(0).astype(int)

#     # Animal Type mapping
#     animal_type_map = {
#         'Dog': 1,
#         'Cat': 2,
#         'Other': 3,
#         'Bird': 4,
#         'Livestock': 5
#     }

#     df['Animal Type'] = df['Animal Type'].map(animal_type_map).fillna(0).astype(int)

#     # Sex upon Intake mapping
#     sex_upon_intake_map = {
#         'Neutered Male': 1,
#         'Spayed Female': 2,
#         'Intact Male': 3,
#         'Intact Female': 4,
#         'Unknown': 5
#     }

#     df['Sex upon Intake'] = df['Sex upon Intake'].map(sex_upon_intake_map).fillna(0).astype(int)

#     # Age to days
#     def age_to_days(age_str):
#         if isinstance(age_str, str):
#             age_str = age_str.lower().replace(' ', '')
#             years = months = days = 0
#             if 'year' in age_str:
#                 years = int(age_str.split('year')[0].strip().replace('s', ''))
#             if 'month' in age_str:
#                 months = int(age_str.split('month')[0].split()[-1].strip())
#             if 'day' in age_str:
#                 days = int(age_str.split('day')[0].split()[-1].strip())
#             total_days = years * 365 + months * 30 + days
#             return abs(total_days)
#         return 0

#     df['Age upon Intake'] = df['Age upon Intake'].apply(age_to_days)

#     # Breed binary
#     df['Breed'] = df['Breed'].apply(lambda x: 1 if isinstance(x, str) else 0)

#     # Color binary
#     df['Color'] = df['Color'].apply(lambda x: 1 if isinstance(x, str) else 0)

#     # DateTime_outcome
#     df['DateTime_outcome'] = df['DateTime_outcome'].apply(classify_datetime)

#     # MonthYear_outcome
#     df['MonthYear_outcome'] = df['MonthYear_outcome'].apply(lambda x: 1 if isinstance(x, (int, float)) and not pd.isna(x) else 0)

#     # Date of Birth classification
#     def classify_dob(value):
#         if isinstance(value, int):
#             return 1
#         elif isinstance(value, str) and re.match(r'^\d{2}/\d{2}/\d{4}$', value):
#             return 2
#         return 0

#     df['Date of Birth'] = df['Date of Birth'].apply(classify_dob)

#     # Outcome Type mapping
#     outcome_type_map = {
#         'Transfer': 1,
#         'Return to Owner': 2,
#         'Adoption': 3,
#         'Euthanasia': 4,
#         'Disposal': 5,
#         'Died': 6,
#         'Rto-Adopt': 7,
#         'Missing': 8,
#         '0': 9,
#         'Relocate': 10,
#         'Lost': 11,
#         'Stolen': 12
#     }

#     df['Outcome Type'] = df['Outcome Type'].map(outcome_type_map).fillna(0).astype(int)

#     # Sex upon Outcome mapping
#     sex_upon_outcome_map = {
#         'Neutered Male': 1,
#         'Spayed Female': 2,
#         'Intact Female': 3,
#         'Unknown': 4,
#         'Intact Male': 5
#     }

#     df['Sex upon Outcome'] = df['Sex upon Outcome'].map(sex_upon_outcome_map).fillna(0).astype(int)

#     # Age upon Outcome
#     df['Age upon Outcome'] = df['Age upon Outcome'].apply(age_to_days)

#     # Impute missing values
#     numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
#     categorical_cols = df.select_dtypes(include=['object']).columns

#     numeric_imputer = SimpleImputer(strategy='median')
#     df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

#     categorical_imputer = SimpleImputer(strategy='most_frequent')
#     #df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

#     return df

# # Process datasets
# cleaned_missing = preprocess(missing_val_dataset)
# cleaned_ok = preprocess(val_dataset)

# # Output to CSV files
# #cleaned_missing.to_csv('missing_file.csv', index=False)
# #cleaned_ok.to_csv('normal_file.csv', index=False)


# a = cleaned_missing
# x_train = a.drop(columns=a.columns[15])  # Features
# y_train = a.iloc[:, 15]  # Labels - outcome type

# b = cleaned_ok
# x1_train = b.drop(columns=a.columns[15])  # Features
# y1_train = b.iloc[:, 15]  # Labels - outcome type

# # taking first 10k value
# x_train = x_train.head(10000)
# y_train = y_train.head(10000)

# x1_train = x1_train.head(10000)
# y1_train = y1_train.head(10000)


# print(x_train.head())
# print(y_train.head())

# print(x1_train.head())
# print(y1_train.head())





In [None]:
# # Random forest test with accuracy score and ROC AUC

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score, accuracy_score
# from sklearn.preprocessing import label_binarize
# import numpy as np

# def evaluate_random_forest(X_train, y_train):
#     # Initialize RandomForestClassifier with fewer estimators and max_depth
#     rf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)

#     # Fit the model
#     rf.fit(X_train, y_train)

#     # Predict probabilities
#     y_train_proba = rf.predict_proba(X_train)

#     # Predict labels
#     y_train_pred = rf.predict(X_train)

#     # Calculate ROC AUC score for multi-class classification
#     # Binarize the output labels
#     y_train_bin = label_binarize(y_train, classes=np.unique(y_train))

#     # Calculate ROC AUC score
#     roc_auc = roc_auc_score(y_train_bin, y_train_proba, average='macro', multi_class='ovr')

#     # Calculate Accuracy
#     accuracy = accuracy_score(y_train, y_train_pred)

#     return accuracy, roc_auc

# accuracy, roc_auc = evaluate_random_forest(x_train, y_train)
# print("Cleaned Missing:")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"ROC AUC Score: {roc_auc:.4f}")

# accuracy, roc_auc = evaluate_random_forest(x1_train, y1_train)
# print("Cleaned Normal:")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"ROC AUC Score: {roc_auc:.4f}")



In [None]:
# # Feature importance score for Age upon Intake and Age upon Outcome

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, roc_auc_score
# import matplotlib.pyplot as plt
# import pandas as pd

# # Define features of interest
# features_of_interest = ['Age upon Intake', 'Age upon Outcome']

# # Extract features and labels from the first 10,000 rows
# x = x1_train[features_of_interest]  # Select columns by passing a list of column names
# y = y1_train  # outcome type

# # Ensure y is numeric (if it's not already)
# y = pd.to_numeric(y, errors='coerce')

# # Train a Random Forest model
# rf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
# rf.fit(x, y)

# # Get feature importances
# importances = rf.feature_importances_

# # Create a DataFrame for visualization
# importance_df = pd.DataFrame({
#     'Feature': features_of_interest,
#     'Importance': importances
# }).sort_values(by='Importance', ascending=False)

# # Print the feature importance scores
# print(importance_df)

# # Visualize the importances
# plt.figure(figsize=(10, 6))
# plt.barh(importance_df['Feature'], importance_df['Importance'])
# plt.xlabel('Importance')
# plt.title('Feature Importance Scores for Selected Features')
# plt.gca().invert_yaxis()  # Highest importance at the top
# plt.show()

# # Evaluate model performance
# y_pred = rf.predict(x)  # Use x instead of X
# accuracy = accuracy_score(y, y_pred)
# roc_auc = roc_auc_score(y, rf.predict_proba(x), multi_class='ovr')  # For multi-class ROC AUC

# print(f"Accuracy: {accuracy:.4f}")
# print(f"ROC AUC Score: {roc_auc:.4f}")

# # Correlation score between 'Age upon Intake' and 'Outcome Type'
# correlation_intake = x['Age upon Intake'].corr(y1_train)

# # Correlation score between 'Age upon Outcome' and 'Outcome Type'
# correlation_outcome = x['Age upon Outcome'].corr(y1_train)

# print(f"Correlation score between 'Age upon Intake' and 'Outcome Type': {correlation_intake:.4f}")
# print(f"Correlation score between 'Age upon Outcome' and 'Outcome Type': {correlation_outcome:.4f}")







In [None]:
# # Feature importance score for Sex upon Intake and Sex upon Outcome

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, roc_auc_score
# import matplotlib.pyplot as plt
# import pandas as pd

# # Define features of interest
# features_of_interest = ['Sex upon Intake', 'Sex upon Outcome']

# # Extract features and labels from the first 10,000 rows
# x = x1_train[features_of_interest]  # Select columns by passing a list of column names
# y = y1_train  # outcome type

# # Ensure y is numeric (if it's not already)
# y = pd.to_numeric(y, errors='coerce')

# # Train a Random Forest model
# rf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
# rf.fit(x, y)

# # Get feature importances
# importances = rf.feature_importances_

# # Create a DataFrame for visualization
# importance_df = pd.DataFrame({
#     'Feature': features_of_interest,
#     'Importance': importances
# }).sort_values(by='Importance', ascending=False)

# # Print the feature importance scores
# print(importance_df)

# # Visualize the importances
# plt.figure(figsize=(10, 6))
# plt.barh(importance_df['Feature'], importance_df['Importance'])
# plt.xlabel('Importance')
# plt.title('Feature Importance Scores for Selected Features')
# plt.gca().invert_yaxis()  # Highest importance at the top
# plt.show()

# # Evaluate model performance
# y_pred = rf.predict(x)  # Use x instead of X
# accuracy = accuracy_score(y, y_pred)
# roc_auc = roc_auc_score(y, rf.predict_proba(x), multi_class='ovr')  # For multi-class ROC AUC

# print(f"Accuracy: {accuracy:.4f}")
# print(f"ROC AUC Score: {roc_auc:.4f}")

# # Correlation score between 'Sex upon Intake' and 'Outcome Type'
# correlation_intake = x['Sex upon Intake'].corr(y1_train)

# # Correlation score between 'Sex upon Outcome' and 'Outcome Type'
# correlation_outcome = x['Sex upon Outcome'].corr(y1_train)

# print(f"Correlation score between 'Sex upon Intake' and 'Outcome Type': {correlation_intake:.4f}")
# print(f"Correlation score between 'Sex upon Outcome' and 'Outcome Type': {correlation_outcome:.4f}")


In [None]:
# Indiana Dataset Feature Scores with Random Forest
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# raw Indiana data set, read from the working directory
indiana_data = pd.read_csv('indiana-animal-data.csv')

#indiana preprocess before feeding it to ml
def _label_encode(df, column):
  """Overwrite df[column] with LabelEncoder codes of its values rendered as strings."""
  df[column] = LabelEncoder().fit_transform(df[column].astype(str))


def _add_cyclic_date_features(df, column):
  """Replace a date column by sine/cosine encodings of its hour, weekday and month.

  The six new columns are named '<column>_hour_sin', '<column>_hour_cos',
  '<column>_day_sin', ... so that every date column keeps its own features.
  The source column itself is removed from `df`.
  """
  stamps = pd.to_datetime(df[column])
  cycles = (
      ('hour', stamps.dt.hour, 24),
      ('day', stamps.dt.dayofweek, 7),  # Monday=0, Sunday=6
      ('month', stamps.dt.month, 12),
  )
  for part, values, period in cycles:
    angle = 2 * np.pi * values / period
    df[f'{column}_{part}_sin'] = np.sin(angle)
    df[f'{column}_{part}_cos'] = np.cos(angle)
  df.drop(columns=[column], inplace=True)


def _indiana_age_to_months(age_str):
  """Convert an 'animalage' text to months; anything unusable becomes 0.

  The last character is cut off (the original note calls it the full stop) and
  the rest is split on spaces: item 0 is read as years, item 2 as months.
  Texts with fewer than three items, and non-string values, give 0.
  """
  if isinstance(age_str, str):
    parts = age_str[:-1].split(' ')
    if len(parts) >= 3:
      return int(parts[0]) * 12 + int(parts[2])
  return 0


def indiana_preprocess(df):
  """Turn the raw Indiana frame into an all-numeric frame for model training.

  Works on a copy created by fillna(0); the frame passed in is not modified.

  - missing values are replaced by 0 before anything else
  - intakedate, movementdate, returndate and deceaseddate are each replaced by
    six cyclic features prefixed with the source column name (previously all
    four wrote to the same unprefixed columns, so only deceaseddate survived)
  - text columns are label-encoded on their string form
  - sheltercode loses its first character and is cast to int
  - identichipnumber and animalname are overwritten with the constant 1
  - animalage is converted to months

  Returns the processed DataFrame.
  """
  #NOTE(review): after fillna(0) missing dates are the number 0 inside a text column;
  #what pd.to_datetime makes of that placeholder may depend on the pandas version - confirm
  df = df.fillna(0)

  _add_cyclic_date_features(df, 'intakedate')

  _label_encode(df, 'intakereason')

  # drop the first character of sheltercode; an empty remainder counts as 0
  df['sheltercode'] = df['sheltercode'].str[1:].replace('', '0').astype(int)

  # brute force: these two columns carry no information afterwards
  df['identichipnumber'] = 1
  df['animalname'] = 1

  # speciesname is now encoded on its string form like every other text column,
  # so a 0 placeholder from fillna cannot mix types inside the encoder
  for text_column in ('breedname', 'basecolour', 'speciesname'):
    _label_encode(df, text_column)

  df['animalage'] = df['animalage'].apply(_indiana_age_to_months)

  for text_column in ('sexname', 'location', 'movementtype'):
    _label_encode(df, text_column)

  _add_cyclic_date_features(df, 'movementdate')
  _add_cyclic_date_features(df, 'returndate')
  _label_encode(df, 'returnedreason')

  _add_cyclic_date_features(df, 'deceaseddate')
  _label_encode(df, 'deceasedreason')

  return df

# replace the raw frame by its preprocessed version
indiana_data = indiana_preprocess(indiana_data)

# imputer
def impute_missing_values(df):
    """Fill missing entries column by column, in place, and return ``df``.

    float64/int64 columns are filled with the column mean, every other column
    with its most frequent value. A non-numeric column without any observed
    value has no mode and is left untouched.

    Each filled column is written back with an explicit assignment: calling
    ``fillna(inplace=True)`` on ``df[column]`` acts on an intermediate object
    and is not guaranteed to change ``df`` (pandas Copy-on-Write).
    """
    for column in df.columns:
        values = df[column]
        if values.dtype in ('float64', 'int64'):
            fill_value = values.mean()  # Impute with mean
        else:
            modes = values.mode()
            if modes.empty:
                continue  # nothing observed, nothing to impute with
            fill_value = modes.iloc[0]  # Impute with mode
        df[column] = values.fillna(fill_value)
    return df

indiana_data = impute_missing_values(indiana_data)
# print("Null Checks")
# print(indiana_data.isnull().sum())
# print("Infinity Checks")
# print(np.isinf(indiana_data).sum())


#indiana_data.to_csv('indiana_processed.csv', index=False)

# first 10,000 rows only: movementtype is the label, every other column a feature
training_rows = indiana_data.iloc[:10000]
x_train = training_rows.drop(columns=['movementtype'])
y_train = training_rows['movementtype']

# model training and featured score

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Build the Random Forest model; the seed is fixed so the importance scores
# (and therefore the chart below) are the same on every run
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train, y_train)

# Get feature importances
import matplotlib.pyplot as plt
import numpy as np

feature_importances = rf_model.feature_importances_
feature_names = x_train.columns

# Create a bar chart, most important feature first
plt.figure(figsize=(10, 6))
indices = np.argsort(feature_importances)[::-1]  # Sort feature importances in descending order
bar_positions = range(len(feature_importances))
plt.bar(bar_positions, feature_importances[indices], align='center')
plt.xticks(bar_positions, feature_names[indices], rotation=90)
plt.title('Feature Indiana Importances from Random Forest')
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.tight_layout()
plt.show()

# print(indiana_data['breedname'].value_counts())
# print(indiana_data['movementtype'].value_counts())


# Calculate the correlation score for breedname and movementtype
# NOTE(review): both columns are turned into category codes first, so this correlates
# arbitrary integer labels; check that this is the intended association measure
breed_codes = indiana_data['breedname'].astype('category').cat.codes
movement_codes = indiana_data['movementtype'].astype('category').cat.codes
correlation_score = breed_codes.corr(movement_codes)


# Print the correlation score
print("Correlation score between breedname and movementtype:", correlation_score)



In [None]:
# Merged Austin Dataset Feature Scores with Random Forest
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the merged Austin dataset from a CSV file in the working directory.
austin_data = pd.read_csv('merged_austin_data.csv')

# Preprocess the Austin data before feeding it to the ML model

def austin_preprocess(df):
  """Turn the merged Austin intake/outcome frame into an all-numeric frame.

  Steps:
  - missing values are replaced with 0;
  - the date/time columns are dropped;
  - 'Animal.ID.x' / 'Animal.ID.y' lose their first character and become int;
  - the categorical columns are label-encoded on their string form;
  - the age columns keep only the leading number of their text.

  The date/time columns used to be parsed (some with strict formats that
  raise on any non-matching value, including the 0 written by the fill
  step) and were then dropped anyway. They are now dropped without being
  parsed; the returned frame is the same.

  Parameters:
  - df: merged Austin DataFrame; it is not modified.

  Returns:
  - a new DataFrame with the transformed columns.
  """
  datetime_columns = [
      'DateTime.x', 'MonthYear.x', 'DateTimeUTC.x',
      'DateTime.y', 'MonthYear.y', 'Date.of.Birth', 'DateTimeUTC.y',
  ]
  id_columns = ['Animal.ID.x', 'Animal.ID.y']
  categorical_columns = [
      'Name.x', 'Found.Location', 'Intake.Type', 'Intake.Condition',
      'Animal.Type.x', 'Sex.upon.Intake', 'Breed.x', 'Color.x',
      'Name.y', 'Outcome.Type', 'Outcome.Subtype', 'Animal.Type.y',
      'Sex.upon.Outcome', 'Breed.y', 'Color.y',
  ]
  age_columns = ['Age.upon.Intake', 'Age.upon.Outcome']

  # fillna returns a new frame, so the caller's frame stays untouched.
  df = df.fillna(0)

  # None of these columns survives preprocessing, so there is no point parsing them.
  df = df.drop(columns=datetime_columns)

  # Drop the first character of each ID and keep the rest as an integer.
  for col in id_columns:
      df[col] = df[col].str[1:].astype(int)

  # One fresh encoder per column; values are cast to str so the 0 written by
  # fillna above becomes its own category.
  for col in categorical_columns:
      df[col] = LabelEncoder().fit_transform(df[col].astype(str))

  # Keep the text before the first space as an int; entries that are not
  # strings (e.g. the 0 written by fillna) become 0.
  # NOTE(review): anything after the first space is discarded, so a unit
  # written after the number would be ignored -- confirm this is intended.
  for col in age_columns:
      df[col] = df[col].str.split(' ').str[0].fillna('0').astype(int)

  return df

# Run the Austin preprocessing step and rebind austin_data to the frame it returns.
austin_data = austin_preprocess(austin_data)

# imputer
def impute_missing_values(df):
    """Fill missing values column by column, in place, and return the frame.

    Float columns (any width, not only float64) are filled with the column
    mean; every other column is filled with its most frequent value.

    Columns with nothing missing are skipped, and columns with no usable
    fill value (all values missing) are left untouched instead of raising.

    Parameters:
    - df: pandas DataFrame to impute; it is modified in place.

    Returns:
    - the same DataFrame object that was passed in.
    """
    for column in df.columns:
        series = df[column]

        # Nothing to fill: avoids computing a mean/mode that is never used.
        if not series.isna().any():
            continue

        if pd.api.types.is_float_dtype(series):
            fill_value = series.mean()
        else:
            modes = series.mode()
            # mode() is empty when every value is missing.
            if modes.empty:
                continue
            fill_value = modes.iloc[0]

        # An all-missing float column has a NaN mean; filling with it is pointless.
        if pd.isna(fill_value):
            continue

        # Assign back instead of `df[column].fillna(..., inplace=True)`:
        # the chained in-place form is deprecated and has no effect on `df`
        # when pandas Copy-on-Write is enabled.
        df[column] = series.fillna(fill_value)
    return df

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# Fill the gaps that are still present after preprocessing.
austin_data = impute_missing_values(austin_data)

# Optional sanity checks / export, kept disabled
# (the original disabled checks referred to indiana_data):
# print(indiana_data.isnull().sum())
# print(np.isinf(indiana_data).sum())
# austin_data.to_csv('austin_processed.csv', index=False)

# Only the first 10,000 rows are used; 'Outcome.Type' is the target column.
x_train = austin_data.head(10000).drop('Outcome.Type', axis=1)
y_train = austin_data['Outcome.Type'].head(10000)

# print(x_train.dtypes)

# Fit a default Random Forest so its feature importances can be inspected.
rf_model = RandomForestClassifier().fit(x_train, y_train)

feature_names = x_train.columns
feature_importances = rf_model.feature_importances_

# Feature positions ordered from most to least important.
indices = np.argsort(feature_importances)[::-1]

# Bar chart of the importances, most important feature first.
plt.figure(figsize=(10, 6))
plt.bar(range(indices.size), feature_importances[indices], align='center')
plt.xticks(range(indices.size), feature_names[indices], rotation=90)
plt.title('Feature Austin Importances from Random Forest')
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.tight_layout()
plt.show()

# print(indiana_data['breedname'].value_counts())
# print(indiana_data['movementtype'].value_counts())

# Correlation between the category codes of each Breed column and Outcome.Type.
correlation_Breed_x = (
    austin_data['Breed.x'].astype('category').cat.codes
    .corr(austin_data['Outcome.Type'].astype('category').cat.codes)
)
correlation_Breed_y = (
    austin_data['Breed.y'].astype('category').cat.codes
    .corr(austin_data['Outcome.Type'].astype('category').cat.codes)
)

print("Correlation score between Breed.x and Outcome.Type:", correlation_Breed_x)
print("Correlation score between Breed.y and Outcome.Type:", correlation_Breed_y)



In [None]:
# 25/9/2024 12.01 pm optipaw_new Preprocessing Code for ML

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the optipaw dataset from a CSV file in the working directory.
optipaw_data = pd.read_csv('optipaw_new.csv')

# Preprocessing Function

# DROP ANIMAL ID

def optipaw_preprocessing(df):
  """Convert the raw optipaw frame into numeric features.

  Steps:
  - 'Animal.ID': a leading 'A' is stripped from string IDs, then cast to int;
  - 'Name', 'Animal.Type', 'Sex', 'Intake.Type', 'Outcome.Type': label-encoded
    on their string form;
  - 'Color': split on '/', ',' and the word 'and', each token capitalized and
    one-hot encoded into one column per token; the original column is dropped;
  - 'Age': missing values become 0 and the column is stored as float;
  - 'Intake.Date' / 'Outcome.Date': parsed as '%Y-%m-%d %H:%M:%S' and expanded
    into Day, Month, Year, Hour and Hour.Radians columns (0 where the date is
    missing or unparsable); the original date columns are dropped.

  Other columns are passed through unchanged.

  Parameters:
  - df: raw optipaw DataFrame; it is not modified.

  Returns:
  - a new DataFrame with the transformed columns.
  """
  # Work on a copy so the caller's frame is not left half-transformed.
  df = df.copy()

  # Strip the leading 'A' from string IDs; anything else is cast to int directly.
  df['Animal.ID'] = df['Animal.ID'].apply(
      lambda x: int(x[1:]) if isinstance(x, str) and x.startswith('A') else int(x))

  # One fresh encoder per column, fitted on the string form of the values.
  for column in ['Name', 'Animal.Type', 'Sex', 'Intake.Type', 'Outcome.Type']:
      df[column] = LabelEncoder().fit_transform(df[column].astype(str))

  # Normalize every colour description to space-separated, capitalized tokens.
  # - `\band\b` removes only the standalone word, so a colour name that merely
  #   contains the letters "and" is no longer cut apart.
  # - A missing colour becomes an empty string, i.e. a row with no colour flags,
  #   instead of raising when the tokens are capitalized.
  color_tokens = (
      df['Color']
      .fillna('')
      .astype(str)
      .str.replace(r'/|,|\band\b', ' ', regex=True)
      .str.split()
      .apply(lambda colors: ' '.join(color.capitalize() for color in colors))
  )

  # One indicator column per distinct colour token, appended after the
  # existing columns; the original 'Color' column is removed.
  df_colors = color_tokens.str.get_dummies(sep=' ')
  df = pd.concat([df, df_colors], axis=1)
  df = df.drop(columns=['Color'])

  # Age is stored as float (it can hold decimals); missing ages become 0.
  df['Age'] = df['Age'].fillna(0).astype(float)

  # Parse both date columns; values that do not match the format become NaT.
  parsed_dates = {
      prefix: pd.to_datetime(df[f'{prefix}.Date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
      for prefix in ('Intake', 'Outcome')
  }

  # The three loops below keep the column order: all Day/Month/Year columns,
  # then both Hour columns, then both Hour.Radians columns.
  for prefix, stamps in parsed_dates.items():
      df[f'{prefix}.Day'] = stamps.dt.day.fillna(0).astype(int)
      df[f'{prefix}.Month'] = stamps.dt.month.fillna(0).astype(int)
      df[f'{prefix}.Year'] = stamps.dt.year.fillna(0).astype(int)

  for prefix, stamps in parsed_dates.items():
      df[f'{prefix}.Hour'] = stamps.dt.hour.fillna(0).astype(int)

  # Hour expressed as an angle in [0, 2*pi).
  # NOTE(review): the raw angle is not wrapped with sin/cos, so hour 23 and
  # hour 0 end up far apart numerically.
  for prefix in parsed_dates:
      df[f'{prefix}.Hour.Radians'] = (df[f'{prefix}.Hour'] / 24) * 2 * np.pi

  # The raw date columns are fully replaced by the extracted components.
  df = df.drop(columns=['Intake.Date', 'Outcome.Date'])

  return df

optipaw_data = optipaw_preprocessing(optipaw_data)

# Rows whose State is 'Austin' form the training set; all other rows form the test set.
train_data = optipaw_data[optipaw_data['State'] == 'Austin'].copy()
test_data = optipaw_data[optipaw_data['State'] != 'Austin'].copy()

# Drop 'State' column from both train and test datasets
train_data = train_data.drop(columns=['State'])
test_data = test_data.drop(columns=['State'])

print(train_data.head(5))
print(test_data.head(5))


# Lift the pandas display limits so the listings below are printed in full
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns

# Check for datatype of each column
print(train_data.dtypes)
print(test_data.dtypes)

print(train_data.shape)
print(test_data.shape)
# Author note. This text used to trail the print call above, which made the
# whole cell a syntax error:
# "I'll probably create another colour this is pretty messy yeah oh yeah"

# Count the distinct values in each column of both sets
unique_values = train_data.nunique()
unique_values2 = test_data.nunique()


# missing_values = test_data.isnull().sum()
# missing_values2 = train_data.isnull().sum()

print("Train data unique:")
print(unique_values)
print("Test data unique:")
print(unique_values2)
# print(missing_values)
# print(missing_values2)

# Write the first 1000 rows of each set to CSV for inspection
train_data.head(1000).to_csv('train.csv', index=False)
test_data.head(1000).to_csv('test.csv', index=False)


# Temporary (will delete soon): this section checks the feature scores and whether the ML model accepts the processed data

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Features / target taken from the first 10,000 rows of each split;
# 'Outcome.Type' is the target column.
x_train = train_data.head(10000).drop('Outcome.Type', axis=1)
y_train = train_data['Outcome.Type'].head(10000)

x_test = test_data.head(10000).drop('Outcome.Type', axis=1)
y_test = test_data['Outcome.Type'].head(10000)

# print(x_train.dtypes)

# One default Random Forest per split: the first is fitted on the train rows,
# the second on the test rows.
rftrain_model = RandomForestClassifier().fit(x_train, y_train)
rftest_model = RandomForestClassifier().fit(x_test, y_test)


# Feature importances Bar Chart function
import matplotlib.pyplot as plt
import numpy as np

def plot_feature_importances(rf_model, x_train, title):
    """
    Draw a bar chart of a fitted model's feature importances, largest first.

    Parameters:
    rf_model: fitted model exposing `feature_importances_`
    x_train: DataFrame whose columns supply the feature names
    title: label inserted into the chart title

    Returns nothing; the chart is displayed with `plt.show()`.
    """
    importances = rf_model.feature_importances_

    # Feature positions ordered from most to least important.
    order = np.argsort(importances)[::-1]
    positions = range(len(importances))

    plt.figure(figsize=(10, 6))
    plt.bar(positions, importances[order], align='center')
    plt.xticks(positions, x_train.columns[order], rotation=90)
    plt.title(f'Feature Importances from Random Forest - {title} Dataset')
    plt.xlabel('Features')
    plt.ylabel('Importance Score')
    plt.tight_layout()
    plt.show()

# Chart the importances of the model fitted on the train rows, then of the one fitted on the test rows.
plot_feature_importances(rftrain_model, x_train, 'Train')
plot_feature_importances(rftest_model, x_test, 'Test')

# Building function for later (scoring)
from sklearn.metrics import accuracy_score

# Build this function
def calculate_accuracy(y_test, y_pred):
    """
    Return the accuracy of a set of predictions.

    Thin wrapper that forwards both arguments to `accuracy_score`.

    Parameters:
    - y_test: array-like, true labels
    - y_pred: array-like, predicted labels

    Returns:
    - the value returned by `accuracy_score(y_test, y_pred)`
    """
    return accuracy_score(y_test, y_pred)

