In [11]:
import pandas as pd
import numpy as np

# Build a synthetic telecom-churn dataset, then deliberately dirty it with
# duplicate rows and MonthlyCharges outliers, and write the result to CSV.

# Fixed seed so that a kernel restart + re-run regenerates the same file.
SEED = 42
np.random.seed(SEED)

# Define the number of rows
num_rows = 7043

# Create a dictionary to store data (every column is drawn independently)
data = {
    'CustomerID': range(1, num_rows + 1),
    'Gender': np.random.choice(['Male', 'Female'], size=num_rows),
    'SeniorCitizen': np.random.choice([0, 1], size=num_rows),
    'Partner': np.random.choice(['Yes', 'No'], size=num_rows),
    'Dependents': np.random.choice(['Yes', 'No'], size=num_rows),
    # randint's upper bound is exclusive, so 73 is needed to include 72 months
    'Tenure': np.random.randint(1, 73, size=num_rows),  # Varying tenure from 1 to 72 months
    'PhoneService': np.random.choice(['Yes', 'No'], size=num_rows),
    'MultipleLines': np.random.choice(['No', 'Yes', 'No phone service'], size=num_rows),
    'InternetService': np.random.choice(['DSL', 'Fiber optic', 'No'], size=num_rows),
    'OnlineSecurity': np.random.choice(['Yes', 'No', 'No internet service'], size=num_rows),
    'OnlineBackup': np.random.choice(['Yes', 'No', 'No internet service'], size=num_rows),
    'DeviceProtection': np.random.choice(['Yes', 'No', 'No internet service'], size=num_rows),
    'TechSupport': np.random.choice(['Yes', 'No', 'No internet service'], size=num_rows),
    'StreamingTV': np.random.choice(['Yes', 'No', 'No internet service'], size=num_rows),
    'StreamingMovies': np.random.choice(['Yes', 'No', 'No internet service'], size=num_rows),
    'Contract': np.random.choice(['Month-to-Month', 'One Year', 'Two Year'], size=num_rows),
    'PaperlessBilling': np.random.choice(['Yes', 'No'], size=num_rows),
    'PaymentMethod': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)'], size=num_rows),
    'MonthlyCharges': np.random.uniform(20, 120, size=num_rows),  # Varying monthly charges
    'TotalCharges': np.random.uniform(100, 8000, size=num_rows),  # Varying total charges
    'Churn': np.random.choice([0, 1], size=num_rows)  # 0 for not churned, 1 for churned
}

# Create a DataFrame
df = pd.DataFrame(data)

# Introduce some duplicate rows (appended at the end, index renumbered)
num_duplicates = 50
duplicate_rows = df.sample(n=num_duplicates, random_state=SEED)
df = pd.concat([df, duplicate_rows], ignore_index=True)

# Introduce some outliers in 'MonthlyCharges' column.
# Outlier positions are drawn from the full index, duplicates included, so an
# outlier can land on one half of a duplicate pair and make it non-identical.
num_outliers = 10
outliers = np.random.choice(df.index, num_outliers, replace=False)
df.loc[outliers, 'MonthlyCharges'] = df['MonthlyCharges'].max() + np.random.uniform(50, 200, num_outliers)

# Export to CSV
df.to_csv("Telecom_Customer_Churn.csv", index=False)


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [13]:
# Read the churn CSV back from disk and show the resulting row index.
csv_path = "Telecom_Customer_Churn.csv"
data = pd.read_csv(csv_path)
print(data.index)

RangeIndex(start=0, stop=7093, step=1)


In [15]:
# List the column labels of the loaded frame.
print(data.columns)

Index(['CustomerID', 'Gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'Tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')


In [16]:
# Preview the first rows of the loaded frame as plain text.
print(data.head())

   CustomerID  Gender  SeniorCitizen Partner Dependents  Tenure PhoneService  \
0           1    Male              1      No         No      60          Yes   
1           2  Female              1     Yes         No       9          Yes   
2           3    Male              0     Yes         No      38           No   
3           4  Female              0     Yes        Yes      65           No   
4           5    Male              1      No         No      42           No   

      MultipleLines InternetService       OnlineSecurity  ...  \
0                No     Fiber optic                  Yes  ...   
1  No phone service     Fiber optic  No internet service  ...   
2               Yes     Fiber optic                   No  ...   
3  No phone service             DSL  No internet service  ...   
4  No phone service             DSL                  Yes  ...   

      DeviceProtection          TechSupport          StreamingTV  \
0                   No                  Yes                 

In [17]:
# Row count of the loaded frame, taken before any de-duplication.
rows_before = len(data)
print("Number of rows before removing duplicates:", rows_before)

Number of rows before removing duplicates: 7093


In [18]:
# Keep the first occurrence of each fully identical row.
# The result is a new frame; `data` itself is not modified.
data_cleaned = data.drop_duplicates(keep="first")

In [19]:
# Row count of the de-duplicated frame.
rows_after = len(data_cleaned)
print("Number of rows after removing duplicates:", rows_after)

Number of rows after removing duplicates: 7043


In [23]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

CustomerID          0
Gender              0
SeniorCitizen       0
Partner             0
Dependents          0
Tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [24]:
# Distinct TotalCharges values and the number of times each one occurs.
total_charges = data['TotalCharges']
unique, counts = np.unique(total_charges, return_counts=True)
print(unique, counts)

[ 101.00367436  101.5448489   101.86045316 ... 7998.9791295  7999.37057875
 7999.53016364] [1 1 1 ... 1 1 1]


In [28]:
import seaborn as sns

# MonthlyCharges is the target; every other column is kept as a feature.
# NOTE(review): this splits `data`, not `data_cleaned`, so the duplicate rows
# are still included here — confirm that is intended.
target_column = "MonthlyCharges"
y = data[target_column]
X = data.drop(columns=target_column)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [29]:
# (rows, columns) of the training features.
X_train.shape

(5674, 20)

In [30]:
# Length of the training target; should match the row count of X_train.
y_train.shape

(5674,)

In [31]:
# (rows, columns) of the held-out test features.
X_test.shape

(1419, 20)

In [32]:
# Length of the held-out test target; should match the row count of X_test.
y_test.shape

(1419,)

In [33]:
# Persist the de-duplicated frame. `data` still holds the duplicate rows;
# only `data_cleaned` has had them removed, so that is what goes in the
# "cleaned" file.
data_cleaned.to_csv("Cleaned_data_telecom.csv", index=False)