In [1]:
# Core imports for the analysis: numerics, data frames, scaling, PCA, clustering and plotting
import io
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


In [7]:
# Load the three raw inputs: AMR data, GDP data and health expenditure data.
# The directory defaults to the original location, but it can be overridden
# with the AMR_DATA_DIR environment variable so the notebook is not tied to a
# single machine's absolute path.
DATA_DIR = Path(os.environ.get("AMR_DATA_DIR", "/Users/namirsacic/Desktop/Masterarbeit/data"))

amr_data = pd.read_csv(DATA_DIR / "final_AMR_data.csv")
gdp_data = pd.read_csv(DATA_DIR / "gdp_data.csv")
health_expenditure = pd.read_csv(DATA_DIR / "health_expenditure.csv")


In [8]:
# Display the first few rows and the schema summary of each dataset.
#
# DataFrame.info() prints its report and returns None, so storing its return
# value would only store None. The report is captured through the `buf`
# argument instead, so the "... Info" entries hold the actual text.
def _info_text(frame):
    """Return, as a string, the report that ``frame.info()`` would print."""
    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


datasets_info = {
    "AMR Data": amr_data.head(),
    "AMR Data Info": _info_text(amr_data),
    "GDP Data": gdp_data.head(),
    "GDP Data Info": _info_text(gdp_data),
    "Health Expenditure Data": health_expenditure.head(),
    "Health Expenditure Info": _info_text(health_expenditure)
}

# Print the three reports in the same order the info() calls were made before.
for info_key in ("AMR Data Info", "GDP Data Info", "Health Expenditure Info"):
    print(datasets_info[info_key])

datasets_info['AMR Data'], datasets_info['GDP Data'], datasets_info['Health Expenditure Data']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16829 entries, 0 to 16828
Data columns (total 15 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Unnamed: 0                                         16829 non-null  int64  
 1   HealthTopic                                        16829 non-null  object 
 2   Bacteria                                           16829 non-null  object 
 3   Drug_Class                                         16829 non-null  object 
 4   Year                                               16829 non-null  int64  
 5   geo                                                16829 non-null  object 
 6   RegionName                                         16829 non-null  object 
 7   Completeness age_%                                 0 non-null      float64
 8   Completeness gender_%                              0 non-null      float64
 9   I - 's

(   Unnamed: 0               HealthTopic            Bacteria       Drug_Class   
 0           0  Antimicrobial resistance  Acinetobacter spp.  Aminoglycosides  \
 1           1  Antimicrobial resistance  Acinetobacter spp.  Aminoglycosides   
 2           2  Antimicrobial resistance  Acinetobacter spp.  Aminoglycosides   
 3           3  Antimicrobial resistance  Acinetobacter spp.  Aminoglycosides   
 4           4  Antimicrobial resistance  Acinetobacter spp.  Aminoglycosides   
 
    Year geo RegionName  Completeness age_%  Completeness gender_%   
 0  2012  AT    Austria                 NaN                    NaN  \
 1  2012  BE    Belgium                 NaN                    NaN   
 2  2012  BG   Bulgaria                 NaN                    NaN   
 3  2012  CY     Cyprus                 NaN                    NaN   
 4  2012  CZ    Czechia                 NaN                    NaN   
 
    I - 'susceptible, increased exposure' isolates _N   
 0                               

In [9]:
# Merge the AMR data with GDP and health expenditure, then clean the result:
#   1. inner-join the three frames on the shared keys,
#   2. drop columns whose share of missing values exceeds the threshold,
#   3. convert the "%"-named columns to numeric,
#   4. impute remaining numeric gaps with the column median.
#
# NOTE(review): the inputs carry a leftover "Unnamed: 0" index column (it shows
# up as "Unnamed: 0_x" after the merge). It is numeric, so it is kept and later
# treated as a feature -- consider dropping it, or reading the CSVs with
# index_col=0.
MERGE_KEYS = ['Year', 'geo']
MAX_NULL_PERCENT = 50

merged_data = pd.merge(amr_data, gdp_data, on=MERGE_KEYS, how='inner')
merged_data = pd.merge(merged_data, health_expenditure, on=MERGE_KEYS, how='inner')

# Percentage of missing values per column; columns above the threshold go.
null_percentage = merged_data.isnull().mean() * 100
columns_to_drop = null_percentage[null_percentage > MAX_NULL_PERCENT].index.tolist()

# drop() returns a new frame, so merged_data itself stays untouched.
merged_data_clean = merged_data.drop(columns=columns_to_drop)

# Convert percentage columns to numeric. Only non-numeric columns go through
# the string clean-up: the .str accessor raises on numeric columns, and on a
# mixed object column it would turn non-string entries into NaN, hence the
# astype(str) first. Anything unparsable becomes NaN via errors='coerce'.
percent_columns = [col for col in merged_data_clean.columns if '%' in col]
for col in percent_columns:
    values = merged_data_clean[col]
    if not pd.api.types.is_numeric_dtype(values):
        values = values.astype(str).str.replace('%', '', regex=False)
    merged_data_clean[col] = pd.to_numeric(values, errors='coerce')

# Fill remaining missing values with the median of each numeric column.
numeric_columns = merged_data_clean.select_dtypes(include=[np.number]).columns
merged_data_clean[numeric_columns] = merged_data_clean[numeric_columns].fillna(
    merged_data_clean[numeric_columns].median()
)

# DataFrame.info() prints and returns None; capture the report via `buf` so
# merged_data_clean_info actually holds the summary text.
info_buffer = io.StringIO()
merged_data_clean.info(buf=info_buffer)
merged_data_clean_info = info_buffer.getvalue()
merged_data_clean_head = merged_data_clean.head()

print(merged_data_clean_info)
merged_data_clean_head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9060 entries, 0 to 9059
Data columns (total 24 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Unnamed: 0_x                                       9060 non-null   int64  
 1   HealthTopic                                        9060 non-null   object 
 2   Bacteria                                           9060 non-null   object 
 3   Drug_Class                                         9060 non-null   object 
 4   Year                                               9060 non-null   int64  
 5   geo                                                9060 non-null   object 
 6   RegionName                                         9060 non-null   object 
 7   I - 'susceptible, increased exposure' isolates _N  9060 non-null   float64
 8   R - resistant isolates_N                           9060 non-null   float64
 9   R - resi

(   Unnamed: 0_x               HealthTopic               Bacteria   
 0             0  Antimicrobial resistance     Acinetobacter spp.  \
 1           328  Antimicrobial resistance     Acinetobacter spp.   
 2           656  Antimicrobial resistance     Acinetobacter spp.   
 3          1312  Antimicrobial resistance     Acinetobacter spp.   
 4          1971  Antimicrobial resistance  Enterococcus faecalis   
 
                                           Drug_Class  Year geo RegionName   
 0                                    Aminoglycosides  2012  AT    Austria  \
 1                                        Carbapenems  2012  AT    Austria   
 2  Combined resistance (fluoroquinolones, aminogl...  2012  AT    Austria   
 3                                   Fluoroquinolones  2012  AT    Austria   
 4                                   Aminopenicillins  2012  AT    Austria   
 
    I - 'susceptible, increased exposure' isolates _N   
 0                                                0.0  \


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Per-bacteria model comparison. Requires `merged_data_clean` from the cleaning
# cell. For every bacteria:
#   1. standardise the numeric columns and reduce them with PCA (90% variance),
#   2. cluster the PCA scores with KMeans, choosing k by silhouette score,
#   3. split the rows chronologically (sorted by Year) into 80% train / 20% test,
#   4. fit several regressors on the raw numeric columns to predict the cluster
#      label and record R2 / MSE / MAE on both splits.
#
# NOTE(review): the cluster labels are derived from the same numeric columns
# that are then used as model inputs, and they are categorical ids fitted with
# regressors -- confirm that this is the intended evaluation design.
# NOTE(review): the numeric columns include the leftover "Unnamed: 0*" index
# columns visible in the cleaning output -- confirm they should be features.

SEED = 42
TRAIN_FRACTION = 0.8      # chronological share of rows used for training
PCA_VARIANCE = 0.9        # fraction of variance the PCA must retain
MIN_CLUSTERS, MAX_CLUSTERS = 2, 10
RESULT_COLUMNS = [
    'Bacteria', 'Model', 'Optimal PCA Components', 'Optimal Clusters',
    'R² Train', 'R² Test', 'MSE Train', 'MSE Test', 'MAE Train', 'MAE Test',
]


def _select_kmeans(features, candidate_ks, seed=SEED):
    """Fit KMeans for every candidate k and return ``(best_k, fitted_model)``.

    The best k is the one with the highest silhouette score; on ties the
    smallest k wins, which matches ``np.argmax`` over the score list. The
    fitted winner is returned so it does not have to be refitted.
    """
    best_k, best_model, best_score = None, None, -np.inf
    for k in candidate_ks:
        # n_init is pinned so results do not depend on the scikit-learn
        # version (the library default changed from 10 to 'auto').
        candidate = KMeans(n_clusters=k, n_init=10, random_state=seed)
        labels = candidate.fit_predict(features)
        score = silhouette_score(features, labels)
        if score > best_score:
            best_k, best_model, best_score = k, candidate, score
    return best_k, best_model


def _build_models(seed=SEED):
    """Return fresh, unfitted estimators keyed by their display name."""
    return {
        'Linear Regression': LinearRegression(),
        # The default iteration budget triggered a coordinate-descent warning
        # for every bacteria in the recorded output, so it is raised here.
        'Lasso Regression': Lasso(max_iter=10_000),
        'Random Forest': RandomForestRegressor(random_state=seed),
        'SVM': SVR(),
        'XGBoost': XGBRegressor(objective='reg:squarederror'),
        # Plain linear model; degree-2 polynomial features are built at fit time.
        'Polynomial Regression': LinearRegression(),
    }


# List of bacteria to iterate through
bacteria_list = merged_data_clean['Bacteria'].unique()

# One dict per (bacteria, model); turned into a DataFrame once at the end
# instead of growing a DataFrame with pd.concat on every iteration.
result_rows = []

for bacteria in bacteria_list:
    print(f"Processing {bacteria}...")
    # Sort by Year so the positional split below is chronological.
    data = merged_data_clean[merged_data_clean['Bacteria'] == bacteria].sort_values(by='Year')

    # silhouette_score needs 2 <= k <= n_samples - 1; skip groups that are too small.
    candidate_ks = range(MIN_CLUSTERS, min(MAX_CLUSTERS, len(data) - 1) + 1)
    if len(candidate_ks) == 0:
        print(f"Skipping {bacteria}: only {len(data)} rows, too few to cluster.")
        continue

    scaler = StandardScaler()
    # Captured before 'Cluster' is added, so the target never leaks into X.
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    data_scaled = scaler.fit_transform(data[numeric_columns])

    pca = PCA(n_components=PCA_VARIANCE)
    data_pca = pca.fit_transform(data_scaled)

    optimal_clusters, kmeans = _select_kmeans(data_pca, candidate_ks)
    clusters = kmeans.labels_
    data = data.assign(Cluster=clusters)

    # Splitting the data into training and testing sets (first 80% of rows train).
    train_index = int(len(data) * TRAIN_FRACTION)
    train_part, test_part = data.iloc[:train_index], data.iloc[train_index:]
    X_train, X_test = train_part[numeric_columns], test_part[numeric_columns]
    y_train, y_test = train_part['Cluster'], test_part['Cluster']

    models = _build_models()

    for name, model in models.items():
        if name == 'Polynomial Regression':
            poly = PolynomialFeatures(degree=2)
            fit_inputs = poly.fit_transform(X_train)
            eval_inputs = poly.transform(X_test)
        else:
            fit_inputs, eval_inputs = X_train, X_test

        model.fit(fit_inputs, y_train)
        predictions_train = model.predict(fit_inputs)
        predictions_test = model.predict(eval_inputs)

        # Metrics for both training and test sets.
        result_rows.append({
            'Bacteria': bacteria,
            'Model': name,
            'Optimal PCA Components': pca.n_components_,
            'Optimal Clusters': optimal_clusters,
            'R² Train': r2_score(y_train, predictions_train),
            'R² Test': r2_score(y_test, predictions_test),
            'MSE Train': mean_squared_error(y_train, predictions_train),
            'MSE Test': mean_squared_error(y_test, predictions_test),
            'MAE Train': mean_absolute_error(y_train, predictions_train),
            'MAE Test': mean_absolute_error(y_test, predictions_test),
        })

# Explicit columns keep the header stable even if every bacteria was skipped.
results_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)

results_df.to_csv('model_comparison_results.csv', index=False)
print(results_df)


Processing Acinetobacter spp....


  model = cd_fast.enet_coordinate_descent(


Processing Enterococcus faecalis...


  model = cd_fast.enet_coordinate_descent(


Processing Enterococcus faecium...


  model = cd_fast.enet_coordinate_descent(


Processing Escherichia coli...


  model = cd_fast.enet_coordinate_descent(


Processing Klebsiella pneumoniae...


  model = cd_fast.enet_coordinate_descent(


Processing Pseudomonas aeruginosa...


  model = cd_fast.enet_coordinate_descent(


Processing Staphylococcus aureus...


  model = cd_fast.enet_coordinate_descent(


Processing Streptococcus pneumoniae...


  model = cd_fast.enet_coordinate_descent(


                    Bacteria                  Model  Optimal PCA Components   
0         Acinetobacter spp.      Linear Regression                       9  \
1         Acinetobacter spp.       Lasso Regression                       9   
2         Acinetobacter spp.          Random Forest                       9   
3         Acinetobacter spp.                    SVM                       9   
4         Acinetobacter spp.                XGBoost                       9   
5         Acinetobacter spp.  Polynomial Regression                       9   
6      Enterococcus faecalis      Linear Regression                       9   
7      Enterococcus faecalis       Lasso Regression                       9   
8      Enterococcus faecalis          Random Forest                       9   
9      Enterococcus faecalis                    SVM                       9   
10     Enterococcus faecalis                XGBoost                       9   
11     Enterococcus faecalis  Polynomial Regression 