In [12]:
import pandas as pd


file_path = r'C:\Users\nlpsw\OneDrive\Documents\datasetss.csv'


df = pd.read_csv(file_path)

print(df.head())

print(df.describe())

print(df[['age', 'weight(kg)']])

average_weight = df['weight(kg)'].mean()
print(f'Average weight: {average_weight}')

num_smokers = df['smoking'].sum()
print(f'Number of smokers: {num_smokers}')

caries_patients = df[df['dental caries'] == 1]
print(f'Patients with dental caries: {len(caries_patients)}')

correlation_matrix = df.corr()
print(correlation_matrix)


   age  weight(kg)  systolic  relaxation  fasting blood sugar  Cholesterol  \
0   35          85       118          78                   97          239   
1   20         110       119          79                   88          211   
2   45          65       110          80                   80          193   
3   45          80       158          88                  249          210   
4   20          60       109          64                  100          179   

   triglyceride  HDL  LDL  hemoglobin  Urine protein  serum creatinine   AST  \
0           153   70  142        19.8              1               1.0    61   
1           128   71  114        15.9              1               1.1    19   
2           120   57  112        13.7              3               0.6  1090   
3           366   46   91        16.9              1               0.9    32   
4           200   47   92        14.9              1               1.2    26   

    ALT  Gtp  smoking  dental caries  
0   115  12

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset into a pandas DataFrame
file_path = r"C:\Users\nlpsw\OneDrive\Documents\datasetss.csv"
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['dental caries'])  # Features
y = df['dental caries']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numerical and categorical data
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model: {accuracy:.2f}')


Accuracy of the model: 0.83


In [16]:


new_patient_data = {
    'age': 35,
    'weight(kg)': 70,
    'systolic': 120,
    'relaxation': 80,
    'fasting blood sugar': 90,
    'Cholesterol': 200,
    'triglyceride': 150,
    'HDL': 50,
    'LDL': 130,
    'hemoglobin': 14,
    'Urine protein': 0.2,
    'serum creatinine': 1.0,
    'AST': 25,
    'ALT': 30,
    'Gtp': 35,
    'smoking':1
}


new_patient_df = pd.DataFrame([new_patient_data])
prediction = model.predict(new_patient_df)

if prediction[0] == 1:
    print("Patient is predicted to have dental caries.")
else:
    print("Patient is predicted not to have dental caries.")

Patient is predicted not to have dental caries.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset into a pandas DataFrame
file_path = r"C:\Users\nlpsw\OneDrive\Documents\datasetss.csv"
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['dental caries'])  # Features
y = df['dental caries']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define preprocessing steps for numerical and categorical data
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Accuracy: 0.81
Precision: 0.92
Recall: 0.15
F1 Score: 0.25


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset into a pandas DataFrame
file_path = r"C:\Users\nlpsw\OneDrive\Documents\datasetss.csv"
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['target_column'])  # Features
y = df['target_column']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for numerical and categorical data
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipeline with hyperparameter tuning
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best negative mean squared error found: ", grid_search.best_score_)

# Evaluate the best model on test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse:.2f}')


In [10]:
# Train the model (assuming you have already trained it)
model.fit(X_train, y_train)

# Predict probabilities for test data
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of class 1 (positive class)

# Adjust classification threshold (e.g., lowering it to 0.3)
threshold = 0.5
y_pred_adjusted = (y_pred_proba > threshold).astype(int)

# Evaluate the model with adjusted threshold
accuracy = accuracy_score(y_test, y_pred_adjusted)
precision = precision_score(y_test, y_pred_adjusted)
recall = recall_score(y_test, y_pred_adjusted)
f1 = f1_score(y_test, y_pred_adjusted)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Accuracy: 0.83
Precision: 0.96
Recall: 0.21
F1 Score: 0.35


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[3036    7]
 [ 648  208]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      1.00      0.90      3043
           1       0.97      0.24      0.39       856

    accuracy                           0.83      3899
   macro avg       0.90      0.62      0.65      3899
weighted avg       0.86      0.83      0.79      3899



In [21]:
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# Initialize SVM classifier
svm_model = SVC(kernel='rbf', random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))


Confusion Matrix:
 [[3043    0]
 [ 856    0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      1.00      0.88      3043
           1       0.00      0.00      0.00       856

    accuracy                           0.78      3899
   macro avg       0.39      0.50      0.44      3899
weighted avg       0.61      0.78      0.68      3899



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
import pandas as pd


file_path = r'C:\Users\nlpsw\OneDrive\Documents\datasetss.csv'


df = pd.read_csv(file_path)

print(df.head())

print(df.describe())
total_records = len(df)
print(f'Total number of records: {total_records}')

   age  weight(kg)  systolic  relaxation  fasting blood sugar  Cholesterol  \
0   35          85       118          78                   97          239   
1   20         110       119          79                   88          211   
2   45          65       110          80                   80          193   
3   45          80       158          88                  249          210   
4   20          60       109          64                  100          179   

   triglyceride  HDL  LDL  hemoglobin  Urine protein  serum creatinine   AST  \
0           153   70  142        19.8              1               1.0    61   
1           128   71  114        15.9              1               1.1    19   
2           120   57  112        13.7              3               0.6  1090   
3           366   46   91        16.9              1               0.9    32   
4           200   47   92        14.9              1               1.2    26   

    ALT  Gtp  smoking  dental caries  
0   115  12