In [None]:
!pip install kneed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import scipy.stats as stats
from scipy.stats import shapiro , kstest , anderson , normaltest
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from kneed import KneeLocator
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold, cross_val_score , GridSearchCV ,train_test_split
from sklearn.metrics import make_scorer, f1_score ,classification_report, accuracy_score, confusion_matrix
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam



In [None]:
# Colab-only step: prompt for a file upload from the local machine and keep the
# result in `uploaded`.
from google.colab import files
uploaded = files.upload()

# Load the uploaded Telco churn CSV into the working DataFrame.
TELCO_CSV = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(TELCO_CSV)

In [None]:
# `df.info()` prints its report itself, so it runs first. `df.head()` is kept as
# the last expression because a notebook cell only renders its final expression;
# in the previous order the preview was computed and silently discarded.
df.info()
df.head()

In [None]:
# TotalCharges is parsed with errors='coerce', so any entry that is not a valid
# number becomes NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Show the per-column missing counts (the bare expression in the middle of the
# cell was never displayed).
print("Missing values after coercion:\n", df.isnull().sum())

# Impute TotalCharges only. A frame-wide `df.fillna(mean)` would also write the
# TotalCharges mean into NaNs of every other column, including text columns.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

# Fully duplicated rows, printed for inspection (nothing is removed here).
duplicates = df[df.duplicated()]
print(duplicates)



In [None]:
#df['CorrectTotalCharges'] = df['tenure'] * df['MonthlyCharges']

# Identify rows where TotalCharges is incorrect
#incorrect_total_charges = df[df['TotalCharges'] != df['CorrectTotalCharges']]
#incorrect_total_charges

In [None]:
# Report the frame's dimensions and the per-column missing-value counts.
n_samples, n_features = df.shape
print("data shape : ",df.shape)
print("Missing value :\n ",df.isnull().sum())
print(f"Total number of samples are {n_samples}")
print(f"Total number of features are {n_features}")

In [None]:
# Descriptive statistics for the integer and float columns, rounded to 2 decimals.
numeric_summary = df.describe(include=['int', 'float'])
numeric_summary.round(2)

In [None]:
# Descriptive summary restricted to the object-typed columns.
object_summary = df.describe(include=['object'])
object_summary

In [None]:
# Per-column count of missing values after cleaning.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
#df = df.fillna(df["TotalCharges"].mean())

In [None]:
# Recode the 0/1 SeniorCitizen flag as 'No'/'Yes'.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df['SeniorCitizen']`: an in-place call on a
# column selection is deprecated in pandas 2.x and does not update `df` under
# copy-on-write.
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})
df['SeniorCitizen'].value_counts()

In [None]:
#df['customerID'].duplicated().sum()

In [None]:
# Boolean flag: True unless OnlineSecurity reads "No internet service".
df["Internet service"] = df["OnlineSecurity"].ne("No internet service")

# One-hot encode the multi-category columns in a single call; the dummy columns
# are appended in the order listed here. Contract is deliberately not encoded in
# this step.
ONE_HOT_COLUMNS = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'PaymentMethod']
df_encoded = pd.get_dummies(df, columns=ONE_HOT_COLUMNS)
print(df_encoded)

In [None]:
# Disabled experiment kept as a bare string literal: none of the statements inside
# it execute; the cell only evaluates (and echoes) the string itself.
'''
label_encoder = LabelEncoder()
df['Contract_encoded'] = label_encoder.fit_transform(df['Contract'])
print(df)
'''

In [None]:
# Frame-wide value mapping: yes/no answers and booleans become 1/0, gender becomes
# 1 (Male) / 0 (Female), "No internet service" collapses to 0, and the contract
# type is replaced by 1, 12 or 24.
value_map = {
    'Yes': 1,
    'No': 0,
    True: 1,
    False: 0,
    "Male": 1,
    "Female": 0,
    "No internet service": 0,
    "Month-to-month": 1,
    "One year": 12,
    "Two year": 24,
}
df_encoded = df_encoded.replace(value_map)
print(df_encoded)

In [None]:
# Frequency of each distinct full row in the encoded frame.
row_counts = df_encoded.value_counts()
print(row_counts)

In [None]:
# Convert every column that is still object-typed to numbers; entries that cannot
# be parsed are turned into NaN by errors='coerce'.
object_columns = [name for name in df_encoded.columns if df_encoded[name].dtype == 'object']
for name in object_columns:
    df_encoded[name] = pd.to_numeric(df_encoded[name], errors='coerce')

# The derived flag column is passed through the same conversion explicitly.
internet_flag = df_encoded["Internet service"]
df_encoded["Internet service"] = pd.to_numeric(internet_flag, errors='coerce')
print(df_encoded.dtypes)

In [None]:
# Continuous columns inspected here and reused by the normality tests, scaling
# and outlier cells further down.
column_to_visualize = ["MonthlyCharges" , "TotalCharges" , "tenure"]

for column in column_to_visualize:
    # Histogram: the y axis really is a frequency.
    plt.hist(df_encoded[column] , bins = 60 , edgecolor = "black")
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.title(f"Distribution of {column}")
    plt.show()

    # Box plot: the y axis carries the column's values, not a frequency, so it is
    # labelled with the column name and the title no longer duplicates the
    # histogram's.
    plt.boxplot(df_encoded[column])
    plt.xticks([1], [column])
    plt.ylabel(column)
    plt.title(f"Box plot of {column}")
    plt.show()

In [None]:
# Shapiro-Wilk normality test on each continuous column at the 5% level.
# NOTE(review): SciPy documents that the p-value may be inaccurate for samples
# larger than 5000 rows - confirm the sample size before relying on it.
for column in column_to_visualize:
  stat, p = shapiro(df_encoded[column])
  print('Statistics=%.3f, p=%.3f' % (stat, p))
  if p > 0.05:
      print(f'according to shapiro test the data looks normally distributed for {column}')
  else:
      # Message typo fixed ("ccording" -> "according").
      print(f'according to shapiro test the data does not look normally distributed for {column}')


In [None]:
# Kolmogorov-Smirnov normality test on each continuous column at the 5% level.
# `kstest(x, "norm")` with no arguments compares against the *standard* normal
# N(0, 1); raw charges/tenure values are nowhere near that scale, so the test
# rejected regardless of the distribution's shape. The sample mean and standard
# deviation are now passed as the reference distribution's loc and scale.
# NOTE(review): estimating the parameters from the same sample makes the plain KS
# p-value optimistic (Lilliefors correction) - treat the result as indicative.
for column in column_to_visualize:
  values = df_encoded[column]
  stat, p = kstest(values , "norm" , args=(values.mean(), values.std()))
  print('Statistics=%.3f, p=%.3f' % (stat, p))
  if p > 0.05:
      print(f'according to kstest test the data looks normally distributed for {column}')
  else:
      # Message typo fixed ("ccording" -> "according").
      print(f'according to kstest test the data does not look normally distributed for {column}')

In [None]:
# One scatter plot per pair of continuous columns: (x column, y column, legend text).
scatter_pairs = [
    ('MonthlyCharges', 'TotalCharges', "Monthly and total charges"),
    ('MonthlyCharges', 'tenure', "Monthly and tenure"),
    ('TotalCharges', 'tenure', "Total charges and tenure"),
]

for x_col, y_col, legend_text in scatter_pairs:
    plt.scatter(df_encoded[x_col], df_encoded[y_col], label=legend_text)
    plt.legend()
    plt.show()

In [None]:
# Min-max scale the three continuous columns and wrap the resulting array in a
# fresh DataFrame (default integer column labels).
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_encoded[column_to_visualize])
df_scaled = pd.DataFrame(scaled_values)


In [None]:

# k-distance graph used to pick DBSCAN's eps: for every point take the distance to
# its k-th neighbour, sort those distances, and look for the knee of the curve.
k = 5
neighbors = NearestNeighbors(n_neighbors = k)
neighbors_fit = neighbors.fit(df_scaled)
distances, indices = neighbors_fit.kneighbors(df_scaled)

# Column k - 1 is the farthest of the k neighbours returned for each point.
# NOTE(review): the query set is the training set, so each point is presumably
# returned as its own first neighbour - confirm whether k should be k + 1.
distances = np.sort(distances[:, k - 1])
kneedle = KneeLocator(np.arange(len(distances)) , distances, curve='convex', direction='increasing')

plt.plot(distances)
plt.title("K-Distance Graph")
plt.xlabel("Data Points Sorted by Distance")
# The label is derived from k; it previously hard-coded "7th" while k was 5.
plt.ylabel(f"Distance to {k}th Nearest Neighbor")
plt.show()

In [None]:
# Position (index into the sorted distances) that KneeLocator reports as the elbow.
elbow_position = kneedle.elbow
print(f"The elbow is located at sample: {elbow_position}")

In [None]:
# Cluster the scaled features with DBSCAN; points labelled -1 belong to no cluster
# and are reported as outliers.
db = DBSCAN(eps=0.0375, min_samples=10)
db.fit(df_scaled)
labels = db.labels_

noise_mask = labels == -1
outliers = df_scaled[noise_mask]
print("Outliers detected by DBSCAN:")
print(len(outliers))

In [None]:
df_scaled = pd.DataFrame(df_scaled)

# Tukey fences: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile. The matching rows are printed from
# the original (un-encoded) frame.
for column in column_to_visualize:
    series = df_encoded[column]
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    outside_fences = (series < low_fence) | (series > high_fence)
    outliers = df[outside_fences]
    print(f"Outliers in {column}:")
    print(outliers)

In [None]:
# Isolation Forest on the scaled features, flagging roughly 2% of rows
# (contamination=0.02) as outliers; predict() marks outliers with -1.
# random_state is fixed so the flagged set is the same on every run - the forest
# is randomised and was previously unseeded.
isolation_forest = IsolationForest(contamination=0.02, random_state=42)
isolation_forest.fit(df_scaled)
outliers = df_scaled[isolation_forest.predict(df_scaled) == -1]

In [None]:
# (rows, columns) of the rows most recently stored in `outliers`.
outliers.shape

In [None]:
# Per-column NaN counts of the encoded frame. The previous `.isna().isna().sum()`
# applied isna() to an all-boolean frame, which contains no NaN, so it always
# reported 0 for every column.
print(df_encoded.isna().sum())

# Preview kept as the last expression so the notebook actually renders it.
df_encoded.head()


In [None]:
# Correlation of every column with Churn, strongest positive first.
churn_column = df_encoded.corr()['Churn']
corr = churn_column.sort_values(ascending=False)
print(corr)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Features are every column except the target. customerID is excluded as well: it
# is an identifier rather than a predictor, and the earlier blanket
# `pd.to_numeric(..., errors='coerce')` pass turns non-numeric text into NaN,
# which some scikit-learn versions reject outright.
X = df_encoded.drop(columns=['Churn']).drop(columns=['customerID'], errors='ignore')
y = df_encoded['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seeded so the importance ranking is reproducible between runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Impurity-based importances, largest first.
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feature_importances)

In [None]:
# Build the engineered feature table on top of a deep copy of the encoded frame.
Engineered_df = df_encoded.copy(deep = True)

# Number of service flags that are switched on for each customer.
service_flags = ['PhoneService', 'MultipleLines_Yes', 'OnlineSecurity_Yes', 'OnlineBackup',
                 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
Engineered_df['num_services'] = (df_encoded[service_flags] == 1).sum(axis=1)

# TotalCharges / MonthlyCharges; undefined ratios are filled with 0.
charges_ratio = pd.to_numeric(df_encoded['TotalCharges'], errors='coerce') / df_encoded['MonthlyCharges']
Engineered_df['charges_ratio'] = charges_ratio.fillna(0).round(2)

# 'contract_churn_interaction' (Contract + "_" + Churn) is intentionally NOT
# created: it embedded the target in a feature, so every model trained on
# `Engineered_df.drop(columns=['Churn'])` could read the label straight from it.

# 1 when the customer is a senior citizen AND has internet service.
# InternetService_No is a 0/1 dummy, so the old comparison against the string
# 'Yes' was always true and the feature simply duplicated SeniorCitizen.
Engineered_df['senior_internet'] = (
    (df_encoded['SeniorCitizen'] == 1) & (df_encoded['InternetService_No'] == 0)
).astype(int)

# Weighted blend of tenure x monthly charges and tenure x total charges.
# The rounded result is assigned back; the bare `.round(2)` was discarded.
Engineered_df["Customer_score"] = (
    df_encoded["tenure"] * df_encoded["MonthlyCharges"] * 0.5
    + df_encoded["tenure"] * df_encoded["TotalCharges"] * 0.5
).round(2)

# Count of protection/support add-ons. The integer cast now applies to the whole
# sum; previously operator precedence applied it to TechSupport only.
Engineered_df["Tech_savvy"] = (
    df_encoded[["OnlineBackup", "DeviceProtection", "TechSupport"]].sum(axis=1).astype(int)
)

# Percentile rank of tenure x contract x total charges.
Engineered_df["LoyaltyScore"] = (
    Engineered_df["tenure"] * Engineered_df["Contract"] * Engineered_df["TotalCharges"]
).rank(pct=True).round(2)

# 75th-percentile thresholds for the high-value flag.
monthly_charges_75th = df_encoded['MonthlyCharges'].quantile(0.75)
total_charges_75th = df_encoded['TotalCharges'].quantile(0.75)

# NOTE(review): percentile-ranking a 0/1 flag yields just two fractional values
# rather than a 0/1 indicator - confirm this is intended. Kept as-is because the
# interaction features below are built on these values.
Engineered_df['HighValueCustomer'] = (
    (df_encoded['MonthlyCharges'] >= monthly_charges_75th) |
    (df_encoded['TotalCharges'] >= total_charges_75th)
).astype(int).rank(pct=True)

# Ratio and pairwise interaction features.
Engineered_df['tenure_ratio'] = Engineered_df['tenure'] / Engineered_df['MonthlyCharges']
Engineered_df['customer_loyalty_interaction'] = Engineered_df['Customer_score'] * Engineered_df['LoyaltyScore']
Engineered_df['tech_loyalty_interaction'] = Engineered_df['Tech_savvy'] * Engineered_df['LoyaltyScore']
Engineered_df['high_value_loyalty_interaction'] = Engineered_df['HighValueCustomer'] * Engineered_df['LoyaltyScore']
Engineered_df['tech_high_value_interaction'] = Engineered_df['Tech_savvy'] * Engineered_df['HighValueCustomer']

# Total number of columns after feature engineering.
len(Engineered_df.columns)

In [None]:
# First 100 rows of the engineered feature table.
Engineered_df.head(100)

In [None]:
# How many customers fall into each Tech_savvy value.
Engineered_df["Tech_savvy"].value_counts()

In [None]:
# Bar counts of the senior_internet flag, split by churn outcome.
ax = sns.countplot(data=Engineered_df, x='senior_internet', hue='Churn')
ax.set_title('Distribution of Senior Citizens with Internet Service vs Churn')
ax.set_xlabel('Senior Citizen with Internet Service')
ax.set_ylabel('Count')
plt.show()

In [None]:

# Churn counts per contract type. The previous cell drew a 60-bin histogram of the
# binary Churn column and labelled its y axis "Contract", so the relationship
# named in the title was never plotted.
# Contract holds the numeric codes 1 / 12 / 24 assigned during encoding.
ax = sns.countplot(data=Engineered_df, x='Contract', hue='Churn')
ax.set_xlabel('Contract')
ax.set_ylabel('Count')
ax.set_title("Churn and Contract")
plt.show()

In [None]:
# Mean, median and range for two engineered features. The second block is
# separated from the first by a blank line in the output.
for position, feature in enumerate(['num_services', 'charges_ratio']):
    lead = "" if position == 0 else "\n"
    values = Engineered_df[feature]
    print(f"{lead}the Mean for {feature} : ",values.mean().round(2))
    print(f"the Median for {feature} : ",values.median())
    print(f"the Range for {feature} : ",values.max()-values.min())

In [None]:

sns.set(style="whitegrid")

# 2 x 2 grid: two charge histograms on top, service counts and charges ratio below.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Distributions and Churn Analysis', fontsize=16)
(top_left, top_right), (bottom_left, bottom_right) = axes

# Top row: KDE-smoothed histograms of the charge columns, coloured by churn.
charge_panels = (
    (top_left, 'MonthlyCharges', 'Monthly Charges'),
    (top_right, 'TotalCharges', 'Total Charges'),
)
for panel, feature, label in charge_panels:
    sns.histplot(Engineered_df, x=feature, kde=True, hue='Churn', ax=panel)
    panel.set_title(f'{label} Distribution by Churn')
    panel.set_xlabel(label)

# Bottom left: side-by-side bars of the number of services per churn group.
sns.histplot(Engineered_df, x='num_services', kde=False, hue='Churn', multiple="dodge", ax=bottom_left)
bottom_left.set_title('Number of Services Distribution by Churn')
bottom_left.set_xlabel('Number of Services')

# Bottom right: charges ratio per churn group.
sns.boxplot(Engineered_df, x='Churn', y='charges_ratio', ax=bottom_right)
bottom_right.set_title('Charges Ratio by Churn')
bottom_right.set_xlabel('Churn')
bottom_right.set_ylabel('Charges Ratio')

# Leave room at the top for the figure-level title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


**1. Monthly Charges Distribution by Churn (Top Left)**
Customers with higher monthly charges tend to churn more frequently than those with lower charges.
Offering plans or promotions targeting high-spending customers could reduce churn, as they may feel dissatisfied with the value they receive.

**2. Total Charges Distribution by Churn (Top Right)**
Customers who leave tend to have lower total charges overall, possibly indicating that churned customers are either newer customers or less engaged over time.
A focus on engagement strategies early in the customer lifecycle might help retain these customers.

**3. Number of Services Distribution by Churn (Bottom Left)**
Customers using fewer services are more likely to churn. This suggests that bundling services together could increase retention.
Encouraging customers to adopt multiple services may create stickiness and reduce churn rates.

**4. Charges Ratio by Churn (Bottom Right)**
A higher charges ratio may correlate with longer-term or more engaged customers. Customers with lower ratios (possibly indicating limited usage or engagement) are more likely to churn.
Offering targeted engagement strategies to customers with lower usage ratios could help reduce churn.




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the figure that holds the four box plots.
plt.figure(figsize=(16, 12))

# (engineered feature, panel title), drawn left-to-right, top-to-bottom.
boxplot_panels = [
    ('Customer_score', 'Customer Score vs Churn'),
    ('Tech_savvy', 'Tech Savvy vs Churn'),
    ('LoyaltyScore', 'Loyalty Score vs Churn'),
    ('HighValueCustomer', 'High Value Customer vs Churn'),
]

for slot, (feature, heading) in enumerate(boxplot_panels, start=1):
    plt.subplot(2, 2, slot)
    sns.boxplot(x='Churn', y=feature, data=Engineered_df)
    plt.title(heading)

# Render the figure.
plt.tight_layout()
plt.show()


**1. Customer Score vs Churn (Top Left):**
A higher customer score seems to correlate with customers not churning.
However, the presence of outliers suggests that even high-scoring customers can churn, which could indicate external factors (e.g., service issues or competition) influencing their behavior.

**2. Tech Savvy vs Churn (Top Right)**
Tech-savviness seems to be a factor in retention. Customers who are more tech-savvy (comfortable with digital services) are less likely to churn.
Less tech-savvy customers may struggle with or avoid engaging with services, leading to a higher likelihood of churn.

**3. Loyalty Score vs Churn (Bottom Left)**
Loyalty scores have a significant impact on churn. A higher loyalty score suggests greater engagement and satisfaction, which helps retain customers.
The clear gap in the distributions between churned and non-churned groups indicates that loyalty is a critical factor in customer retention.

**4. High Value Customer vs Churn (Bottom Right)**
Being a high-value customer does not seem to influence churn significantly. This could mean that even high-value customers may churn if other issues (e.g., poor service or lack of engagement) are present.
This finding emphasizes that retention efforts should focus on other factors like loyalty and customer experience, not just monetary value.




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Correlation matrix over the numeric columns only, so text-valued columns are
# neither coerced nor rejected.
correlation_matrix = Engineered_df.corr(numeric_only=True)

# Keep entries with |r| >= 0.5 and blank out the rest. Weak correlations become
# NaN (left empty by the heatmap) instead of being overwritten with 0, which was
# drawn and annotated as if it were a real 0.00 correlation.
strong_corr_matrix = correlation_matrix.where(np.abs(correlation_matrix) >= 0.5)

plt.figure(figsize=(20, 20))
sns.heatmap(strong_corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title('Strong Correlations (|r| > 0.5)')
plt.show()


In [None]:
# Correlations among a hand-picked subset of features; entries with |r| < 0.5 are
# shown as 0.
correlation_matrix = Engineered_df.corr()
features_of_interest = [
    'MonthlyCharges', 'InternetService_Fiber optic', 'TotalCharges', 'InternetService_No',
    'Tech_savvy', 'LoyaltyScore', 'HighValueCustomer', 'tenure', 'StreamingTV',
]

subset = correlation_matrix.loc[features_of_interest, features_of_interest]
is_weak = np.abs(subset) < 0.5
selected_corr_matrix = subset.mask(is_weak, 0)

plt.figure(figsize=(8, 6))
sns.heatmap(selected_corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', square=True, cbar_kws={"shrink": .8})
plt.title('Correlation Matrix for Selected Features')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot with clearly distinct colours per churn outcome.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=Engineered_df,
    x='TotalCharges',
    y='LoyaltyScore',
    hue='Churn',
    palette={0: 'blue', 1: 'red'},  # blue = customers who stayed, red = customers who churned
    alpha=0.6
)

ax.set_title('Relationship between Total Charges and Loyalty Score by Churn')
ax.set_xlabel('Total Charges')
ax.set_ylabel('Loyalty Score')

# Relabel seaborn's own legend entries. Calling `plt.legend(labels=[...])` without
# handles rebuilds the legend from the axes' artists, which does not reproduce
# the per-hue colour entries.
# Assumes the hue levels are ordered 0 then 1 (numeric Churn) - TODO confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['No Churn', 'Churn'], title='Churn')
plt.show()


This plot suggests that a higher loyalty score and total charges are generally indicative of customer retention, while lower loyalty scores are more common among customers who churn. Further analysis could explore these clusters or segments to understand what drives loyalty and retention.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Total charges against tenure, coloured by churn outcome.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=Engineered_df,
    x='TotalCharges',
    y='tenure',
    hue='Churn',
    palette={0: 'blue', 1: 'red'},  # blue = stayed, red = churned
    alpha=0.6
)

ax.set_title('Relationship between Total Charges and tenure by Churn')
ax.set_xlabel('Total Charges')
ax.set_ylabel('tenure')

# Relabel seaborn's own legend entries. Calling `plt.legend(labels=[...])` without
# handles rebuilds the legend from the axes' artists, which does not reproduce
# the per-hue colour entries.
# Assumes the hue levels are ordered 0 then 1 (numeric Churn) - TODO confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['No Churn', 'Churn'], title='Churn')
plt.show()


**Churn (Red)**: Customers who churn tend to have shorter tenures, mostly concentrated within the lower tenure range (0–20 months). This suggests that new or short-term customers are more likely to leave.

**No Churn (Blue)**: Customers who do not churn span across the entire tenure range, with a heavy concentration towards higher tenures (20–70 months). This implies that retention improves as customers stay longer with the company.

In summary, this plot suggests that tenure is a strong indicator of customer loyalty, and most churn happens early in the customer lifecycle. Efforts to increase retention may be most effective when directed at customers with low tenure.








In [None]:
# Remove the identifier column before modelling. Written as a reassignment with
# errors='ignore' so the cell can be re-run safely: the in-place drop raised
# KeyError on a second execution because the column was already gone.
Engineered_df = Engineered_df.drop(columns=['customerID'], errors='ignore')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Features = every column except the target. 'contract_churn_interaction' is
# removed too when present: it is built from Churn itself, so leaving it in X
# leaks the label into the model and inflates every score below.
X = Engineered_df.drop(columns=['Churn']).drop(columns=['contract_churn_interaction'], errors='ignore')
y = Engineered_df['Churn']

# Hold out 30% of the rows for the final test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Continuous columns to standardise; all other columns are passed through as-is.
columns_to_scale = ['MonthlyCharges', 'TotalCharges', 'tenure', 'charges_ratio']

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

# Hyper-parameter grid explored by the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search over the grid, scored by accuracy, on all cores.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Best estimator found by the search (refit on the full training split).
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test_scaled)

print("Best Parameters from Grid Search:", grid_search.best_params_)
print("Best Cross-Validated Accuracy:", grid_search.best_score_)
print("Accuracy Score on Test Data:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Features = every column except the target. 'contract_churn_interaction' is
# removed too when present: it is built from Churn itself and leaks the label.
X = Engineered_df.drop(columns=['Churn']).drop(columns=['contract_churn_interaction'], errors='ignore')
y = Engineered_df['Churn']

# 70/30 train/test split, then 80/20 train/validation split of the training part.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Columns rescaled to [0, 1].
# NOTE(review): other large-magnitude columns (e.g. tenure and the interaction
# features) are left unscaled, which can slow the solver - confirm this is intended.
columns_to_scale = ['MonthlyCharges', "TotalCharges" , "Customer_score", "charges_ratio"]

# Scaler fitted on the training rows only (the unused second scaler was removed).
scaler = MinMaxScaler()

X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

X_val_scaled = X_val.copy()
X_val_scaled[columns_to_scale] = scaler.transform(X_val[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

print("X_train shape:", X_train_scaled.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val_scaled.shape)
print("y_val shape:", y_val.shape)

# max_iter raised from the default of 100, in line with the later
# logistic-regression cells, to give the solver room to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Validation metrics.
y_val_pred = model.predict(X_val_scaled)
print("Validation Accuracy Score:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

# Test metrics.
y_test_pred = model.predict(X_test_scaled)
print("Test Accuracy Score:", accuracy_score(y_test, y_test_pred))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))


In [None]:
# Logistic regression restricted to four engineered features.
features = ['Customer_score', 'Tech_savvy', 'LoyaltyScore', 'HighValueCustomer']
X, y = Engineered_df[features], Engineered_df['Churn']

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Original engineered features followed by the newer ratio/interaction features.
base_features = ['Customer_score', 'Tech_savvy', 'LoyaltyScore', 'HighValueCustomer']
interaction_features = [
    'tenure_ratio',
    'customer_loyalty_interaction',
    'tech_loyalty_interaction',
    'high_value_loyalty_interaction',
    'tech_high_value_interaction',
]
features = base_features + interaction_features

# Feature matrix and target.
X, y = Engineered_df[features], Engineered_df['Churn']

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the logistic regression (fit() returns the estimator).
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

# Engineered features fed to the network.
features = [
    'Customer_score',
    'Tech_savvy',
    'LoyaltyScore',
    'HighValueCustomer',
    'tenure_ratio',
    'customer_loyalty_interaction',
    'tech_loyalty_interaction',
    'high_value_loyalty_interaction',
    'tech_high_value_interaction'
]

X, y = Engineered_df[features], Engineered_df['Churn']

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Two hidden ReLU layers (32 and 16 units) and a single sigmoid output unit.
n_inputs = X_train.shape[1]
model = Sequential([
    Dense(32, input_dim=n_inputs, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 20% of the training rows are held back by Keras as the validation set.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=10,
    callbacks=[early_stopping],
)

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Features = every column except the target. 'contract_churn_interaction' is
# removed too when present: it is built from Churn itself and leaks the label.
X = Engineered_df.drop(columns=['Churn']).drop(columns=['contract_churn_interaction'], errors='ignore')
y = Engineered_df['Churn']

# 70/30 train/test split, then 80/20 train/validation split of the training part.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Columns rescaled to [0, 1]; the scaler is fitted on the training rows only.
columns_to_scale = ['MonthlyCharges', "TotalCharges", "Customer_score", "charges_ratio"]
scaler = MinMaxScaler()

X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

X_val_scaled = X_val.copy()
X_val_scaled[columns_to_scale] = scaler.transform(X_val[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

print("X_train shape:", X_train_scaled.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val_scaled.shape)
print("y_val shape:", y_val.shape)

# Hyper-parameter grid limited to solver/penalty pairs that LogisticRegression
# accepts. The flat cross-product used before contained many unsupported pairs
# (l1 with newton-cg/lbfgs, None with liblinear, and elasticnet with every listed
# solver), each of which only produced a failed fit and a warning. C is omitted
# where penalty is None because it has no effect there.
C_VALUES = [0.01, 0.1, 1, 10, 100]  # inverse of regularisation strength
param_grid = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': C_VALUES},
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': [None]},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': C_VALUES},
]

# 5-fold cross-validated search scored by accuracy; max_iter raised from the
# default so the solvers have room to converge.
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_

# Validation metrics for the best model.
y_val_pred = best_model.predict(X_val_scaled)
print("Validation Accuracy Score:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

# Test metrics for the best model.
y_test_pred = best_model.predict(X_test_scaled)
print("Test Accuracy Score:", accuracy_score(y_test, y_test_pred))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

print("Best Parameters:", grid_search.best_params_)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Features = every column except the target. 'contract_churn_interaction' is
# removed too when present: it is built from Churn itself and leaks the label.
X = Engineered_df.drop(columns=['Churn']).drop(columns=['contract_churn_interaction'], errors='ignore')
y = Engineered_df['Churn']

# 70/30 train/test split, then 80/20 train/validation split of the training part.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Columns rescaled to [0, 1]; the scaler is fitted on the training rows only.
columns_to_scale = ['MonthlyCharges', "TotalCharges", "Customer_score", "charges_ratio"]
scaler = MinMaxScaler()

X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

X_val_scaled = X_val.copy()
X_val_scaled[columns_to_scale] = scaler.transform(X_val[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

print("X_train shape:", X_train_scaled.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val_scaled.shape)
print("y_val shape:", y_val.shape)

# Hyper-parameter grid limited to solver/penalty pairs that LogisticRegression
# accepts; the flat cross-product used before mostly produced failed fits.
# l1_ratio is supplied only with the elasticnet penalty (saga is the only listed
# solver that supports it), and C is omitted where penalty is None.
C_VALUES = [0.01, 0.1, 1, 10, 100]
param_grid = [
    {'solver': ['newton-cg', 'lbfgs', 'saga'], 'penalty': ['l2'], 'C': C_VALUES},
    {'solver': ['newton-cg', 'lbfgs', 'saga'], 'penalty': [None]},
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1'], 'C': C_VALUES},
    {'solver': ['liblinear'], 'penalty': ['l2'], 'C': C_VALUES},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'C': C_VALUES, 'l1_ratio': [0.5]},  # example l1_ratio; adjust as needed
]

# 5-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_

# Training accuracy, used for the overfitting comparison at the end.
y_train_pred = best_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Validation metrics.
y_val_pred = best_model.predict(X_val_scaled)
print("Validation Accuracy Score:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

# Test metrics.
y_test_pred = best_model.predict(X_test_scaled)
print("Test Accuracy Score:", accuracy_score(y_test, y_test_pred))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

print("Best Parameters:", grid_search.best_params_)

# Crude overfitting check: any train accuracy above validation accuracy is flagged.
# NOTE(review): a small positive gap is normal; consider a tolerance before
# treating this message as evidence of overfitting.
print("\nTraining Accuracy Score:", train_accuracy)
if train_accuracy > accuracy_score(y_val, y_val_pred):
    print("Potential Overfitting Detected: Training accuracy is greater than validation accuracy.")
else:
    print("No Overfitting Detected: Training accuracy is not greater than validation accuracy.")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Features = every column except the target. 'contract_churn_interaction' is
# removed too when present: it is built from Churn itself and leaks the label.
X = Engineered_df.drop(columns=['Churn']).drop(columns=['contract_churn_interaction'], errors='ignore')
y = Engineered_df['Churn']

# 75/25 train/test split.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# No separate validation split is made in this cell, so the training set IS the
# full training part. Without this rebinding, `X_train` / `y_train` were stale
# variables left over from the previous cell's different split, whose rows overlap
# this cell's test set (test data leaking into training).
X_train, y_train = X_train_full, y_train_full

# Columns rescaled to [0, 1]; the scaler is fitted on the training rows only.
columns_to_scale = ['MonthlyCharges', 'TotalCharges', 'Customer_score', 'charges_ratio']
scaler = MinMaxScaler()

X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

# Seeded generator so the injected noise is reproducible between runs.
rng = np.random.default_rng(42)

# Add Gaussian noise to the scaled training features.
# NOTE(review): the features were just scaled to [0, 1], so a standard deviation
# of 150 swamps them completely - confirm this magnitude is intended.
noise_std = 150
for column in columns_to_scale:
    noise = rng.normal(0, noise_std, size=X_train_scaled[column].shape)
    X_train_scaled[column] += noise

# Label noise: 30% of the training labels are re-drawn uniformly from {0, 1}
# (re-drawn, not flipped, so about half of them keep their original value).
label_noise_ratio = 0.3
n_noisy_labels = int(label_noise_ratio * y_train.shape[0])
random_indices = rng.choice(y_train.index.to_numpy(), n_noisy_labels, replace=False)
y_train_noisy = y_train.copy()
y_train_noisy.loc[random_indices] = rng.integers(0, 2, n_noisy_labels)

# The test rows are scaled with the training statistics and left noise-free.
X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

print("X_train shape:", X_train_scaled.shape)
print("y_train shape:", y_train_noisy.shape)

# Hyper-parameter grid limited to solver/penalty pairs that LogisticRegression
# accepts; the flat cross-product used before mostly produced failed fits
# (elasticnet is not supported by any of the listed solvers). C is omitted where
# penalty is None.
C_VALUES = [0.01, 0.1, 1, 10, 100]
param_grid = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': C_VALUES},
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': [None]},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': C_VALUES},
]

# 5-fold cross-validated search on the noisy training data, scored by accuracy.
grid_search = GridSearchCV(LogisticRegression(max_iter=300), param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled, y_train_noisy)

# Best estimator found by the search.
best_model = grid_search.best_estimator_

# Cross-validation scores
cv_scores = cross_val_score(best_model, X_train_scaled, y_train_noisy, cv=5)
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))

# Make predictions on the validation set using the best model
#y_val_pred = best_model.predict(X_val_scaled)

# Print validation results
#print("Validation Accuracy Score:", accuracy_score(y_val, y_val_pred))
#print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
#print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

# Make predictions on the test set using the best model
y_test_pred = best_model.predict(X_test_scaled)

# Print test results
print("Test Accuracy Score:", accuracy_score(y_test, y_test_pred))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

# Print best parameters
print("Best Parameters:", grid_search.best_params_)


In [None]:
import numpy as np
from keras.layers import Dense, Dropout, BatchNormalization, LSTM
from keras.models import Sequential
from keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# LSTM classifier over the tabular feature matrix: each scaled feature column is
# fed to the network as one timestep carrying a single value.

# Encode target variable as binary
# NOTE(review): X and y are not rebuilt in this cell; they are whatever an
# earlier cell left in the kernel. This re-encoding of df_encoded only reaches
# the model if y was derived from df_encoded after this point. TODO confirm
# which frame is meant to feed this model.
df_encoded['Churn'] = df_encoded['Churn'].replace({2: 0, 1: 1})

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (statistics are fitted on the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape the data for RNN input: (samples, n_features) -> (samples, n_features, 1),
# i.e. (samples, timesteps, features) with one timestep per original column
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

# Initialize the RNN model
model = Sequential()

# First LSTM Layer (returns the full sequence so a second LSTM can be stacked on it)
model.add(LSTM(64, input_shape=(X_train.shape[1], 1), return_sequences=True, kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Second LSTM Layer
model.add(LSTM(32, return_sequences=False, kernel_regularizer=l2(0.001)))  # return_sequences=False for the last LSTM
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Dense Layers
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Output Layer: single sigmoid unit for a binary target
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# Validation metrics are computed on a 20% slice of the TRAINING data rather
# than on the test set, so the test metrics reported below stay a genuinely
# held-out estimate. The labels are passed as a NumPy array so Keras can slice
# them together with the feature array.
history = model.fit(
    X_train,
    np.asarray(y_train),
    epochs=20,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

# Predictions: threshold the sigmoid output at 0.5
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Evaluate the model on the untouched test set
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Print the column names, dtypes and non-null counts of df_encoded
df_encoded.info()