In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

In [4]:
# Load the clinical spreadsheet from the mounted Drive folder.
data_path = '/content/drive/MyDrive/HCC-TACE-Seg_clinical_data-V2.xlsx'
df = pd.read_excel(io=data_path)

# Derived views of the raw frame: a copy without any row that contains a
# missing value, and summary statistics of the raw (unfiltered) data.
df_cleaned = df.dropna(axis=0, how='any')
numerical_stats = df.describe()

In [16]:
# Output folder for every figure and model artifact written by this notebook;
# created on demand, and left alone when it already exists.
save_dir = '/content/drive/MyDrive/'
os.makedirs(name=save_dir, exist_ok=True)

In [17]:
# Write one 20-bin histogram PNG per float64/int64 column into save_dir.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_columns:
    fig, ax = plt.subplots(figsize=(12, 8))
    df[col].hist(bins=20, ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    save_path = os.path.join(save_dir, f'hist_{col}.png')
    fig.savefig(save_path)
    # Close each figure explicitly so they do not pile up across iterations.
    plt.close(fig)

In [18]:
# Write one category-frequency bar chart per object-dtype column into save_dir.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_title(f'Countplot of {col}')
    # Tilt the category labels so long names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    save_path = os.path.join(save_dir, f'countplot_{col}.png')
    fig.savefig(save_path)
    plt.close(fig)

In [19]:
# Pairwise correlations between the numeric columns (pandas' default method),
# rendered as an annotated heatmap and saved to save_dir.
correlation_matrix = df[numerical_columns].corr()

fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='viridis', ax=ax)
ax.set_title('Correlation Matrix')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=0)
# Fit the rotated tick labels inside the canvas before saving; every other
# figure in this notebook does this, and without it the labels can be clipped
# in the written PNG.
fig.tight_layout()
save_path = os.path.join(save_dir, 'correlation_matrix_heatmap.png')
fig.savefig(save_path)
plt.close(fig)

In [20]:
# Same per-category counts as the plain countplots, but with the bars split
# by the Death_1_StillAliveorLostToFU_0 outcome column.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(x=col, data=df, hue='Death_1_StillAliveorLostToFU_0', ax=ax)
    ax.set_title(f'Countplot of {col} by Death')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    save_path = os.path.join(save_dir, f'countplot_{col}_by_death.png')
    fig.savefig(save_path)
    plt.close(fig)

In [22]:
# Average TTP for each value of the Sex column, drawn as a two-colour bar chart.
grouped = df.groupby('Sex')['TTP'].mean()

fig, ax = plt.subplots(figsize=(8, 6))
grouped.plot(kind='bar', color=['blue', 'orange'], ax=ax)
ax.set_title('Mean TTP by Gender')
ax.set_xlabel('Sex')
ax.set_ylabel('Mean TTP')
# Keep the group labels horizontal.
plt.setp(ax.get_xticklabels(), rotation=0)
fig.tight_layout()
save_path = os.path.join(save_dir, 'mean_ttp_by_gender.png')
fig.savefig(save_path)
plt.close(fig)

In [None]:
# Baseline classifier: predict the Death_1_StillAliveorLostToFU_0 outcome from
# age, sex, smoking and alcohol status with a logistic regression.

# Take an explicit copy: the encoded columns are assigned back into `features`
# below, and writing into a bare selection of `df` triggers pandas'
# SettingWithCopyWarning.
features = df[['age', 'Sex', 'Smoking', 'Alcohol']].copy()
target = df['Death_1_StillAliveorLostToFU_0']

# One encoder per column, so every label -> integer mapping stays available
# afterwards (refitting a single encoder keeps only the last column's classes).
label_encoders = {}
for col in ['Sex', 'Smoking', 'Alcohol']:
    label_encoders[col] = LabelEncoder()
    features[col] = label_encoders[col].fit_transform(features[col])
# Backward-compatible alias: as before, `label_encoder` ends up fitted on
# the Alcohol column.
label_encoder = label_encoders['Alcohol']

# Fixed seed so the 80/20 split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the model:", accuracy)

# Persist with joblib, matching the later cell that writes this same path
# with joblib.dump, so the file always has one format and loads with joblib.load.
model_save_path = os.path.join(save_dir, 'logistic_regression_model.pkl')
joblib.dump(model, model_save_path)

In [32]:
# One-hot encode the categorical predictors. They are named explicitly because
# the label-encoding step has already turned them into integer columns, and
# get_dummies only converts object/string/category columns by default, so
# without `columns=` this call would return `features` unchanged.
features_encoded = pd.get_dummies(
    features,
    columns=['Sex', 'Smoking', 'Alcohol'],
    drop_first=True,  # drop one level per variable to avoid redundant indicators
)

# Same seed and test fraction as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

In [None]:
# Refit the logistic regression on the current training split (fit() returns
# the estimator itself), then overwrite the saved model file with it.
model = LogisticRegression().fit(X_train, y_train)
model_save_path = os.path.join(save_dir, 'logistic_regression_model.pkl')
joblib.dump(model, model_save_path)

In [34]:
# Predict with the model that was just refit before scoring it. Otherwise
# `y_pred` still holds the predictions of the earlier model cell and the
# accuracy printed here would describe the wrong model.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.81
