In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import IsolationForest


data = pd.read_csv('merged_dataset2.csv')

# Display the first few rows of the dataset to understand its structure
data.head()


Unnamed: 0,user_id,plan,session_date,mb_used,total_messages_used,total_minutes
0,1000,ultimate,2018-12-26,270.99,11.0,16.0
1,1000,ultimate,2018-12-27,880.22,11.0,16.0
2,1000,ultimate,2018-12-28,660.4,11.0,16.0
3,1000,ultimate,2018-12-29,89.86,11.0,16.0
4,1000,ultimate,2018-12-31,0.0,11.0,16.0


#### SEPARATING DATASETS IN ORDER TO FIND ANOMALIES IN EACH DATASET

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('merged_dataset2.csv')

# Filter the DataFrame for 'ultimate' plans
ultimate_df = df[df['plan'] == 'ultimate']

# Filter the DataFrame for 'surf' plans
surf_df = df[df['plan'] == 'surf']

# Save the DataFrames to new CSV files
ultimate_df.to_csv('ultimate_plans.csv', index=False)
surf_df.to_csv('surf_plans.csv', index=False)



#### CREATING SURF PLANS FINAL CSV

In [7]:
import pandas as pd

# Load the dataset
df = pd.read_csv('cleandatasets/surf_plans.csv')


# Compute the running totals
df['total_mb_used'] = df['mb_used'].cumsum()
df['total_messages_used'] = df['total_messages_used'].cumsum()
df['total_minutes'] = df['total_minutes'].cumsum()

# Drop the 'session_date' column as it's no longer needed
df = df.drop(columns=['session_date'])

# Group by 'user_id' and aggregate the data
# For 'mb_used', we will take the last value in the accumulated data as the total usage
# For 'total_messages_used' and 'total_minutes', we will also take the last accumulated values
aggregated_df = df.groupby('user_id').agg({
    'mb_used': 'last',
    'total_messages_used': 'last',
    'total_minutes': 'last',
    'total_mb_used': 'last'
}).reset_index()

# Save the final DataFrame to a new CSV file
aggregated_df.dropna(inplace=True)
aggregated_df.drop(columns=['mb_used'], inplace=True)
aggregated_df.to_csv('cleandatasets/surf_plans_final.csv', index=False)


CREATING A FINAL CSV FILE FOR ULTIMATE PLANS DATASET

In [16]:
import pandas as pd

# Load the dataset
df = pd.read_csv('cleandatasets/ultimate_plans.csv')


# Compute the running totals
df['total_mb_used'] = df['mb_used'].cumsum()
df['total_messages_used'] = df['total_messages_used'].cumsum()
df['total_minutes'] = df['total_minutes'].cumsum()

# Drop the 'session_date' column as it's no longer needed
df = df.drop(columns=['session_date'])

# Group by 'user_id' and aggregate the data
# For 'mb_used', we will take the last value in the accumulated data as the total usage
# For 'total_messages_used' and 'total_minutes', we will also take the last accumulated values
aggregated_df = df.groupby('user_id').agg({
    'mb_used': 'last',
    'total_messages_used': 'last',
    'total_minutes': 'last',
    'total_mb_used': 'last'
}).reset_index()

# Save the final DataFrame to a new CSV file
aggregated_df.dropna(inplace=True)
aggregated_df.drop(columns=['mb_used'], inplace=True)
aggregated_df.to_csv('cleandatasets/ultimate_plans_final.csv', index=False)


#### FINDING ANOMALIES IN THE ULTIMATE DATASET

In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Load the dataset
df = pd.read_csv('cleandatasets/ultimate_plans_final.csv')

# Print the first few rows to understand the structure of the dataset
print("Original DataFrame:")
print(df.head())

# Identify numerical columns
numerical_cols = ['total_messages_used', 'total_minutes', 'total_mb_used']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical columns
scaled_data = scaler.fit_transform(df[numerical_cols])

# Display the shape of scaled data to ensure correct scaling
print(f"Shape of scaled data: {scaled_data.shape}")

# Train Isolation Forest model
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(scaled_data)

# Predict anomalies
anomalies = model.predict(scaled_data)

# Add anomalies to the original data
df['Anomaly'] = anomalies

# Filter the data to show only the anomalies
anomalies_data = df[df['Anomaly'] == -1]

# Print the anomalies
print("Anomalies Detected:")
print(anomalies_data)

# Save the anomalies to a new CSV file
anomalies_data.to_csv('cleandatasets/ultimate_plans_anomalies.csv', index=False)


Original DataFrame:
   user_id  total_messages_used  total_minutes  total_mb_used
0     1000                 55.0           80.0        1901.47
1     1006              10835.0          850.0       36088.66
2     1008              25619.0        39394.0       91561.70
3     1011             177729.0       208724.0      223340.30
4     1013             178801.0       211069.0      243454.22
Shape of scaled data: (131, 3)
Anomalies Detected:
     user_id  total_messages_used  total_minutes  total_mb_used  Anomaly
0       1000                 55.0           80.0        1901.47       -1
1       1006              10835.0          850.0       36088.66       -1
2       1008              25619.0        39394.0       91561.70       -1
3       1011             177729.0       208724.0      223340.30       -1
4       1013             178801.0       211069.0      243454.22       -1
5       1026             179681.0       213349.0      256678.68       -1
14      1039            1006917.0       838780

#### FINDING ANOMALIES IN THE SURF DATASET

In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Load the dataset
df = pd.read_csv('cleandatasets/surf_plans_final.csv')

# Print the first few rows to understand the structure of the dataset
print("Original DataFrame:")
print(df.head())

# Identify numerical columns
numerical_cols = ['total_messages_used', 'total_minutes', 'total_mb_used']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical columns
scaled_data = scaler.fit_transform(df[numerical_cols])

# Display the shape of scaled data to ensure correct scaling
print(f"Shape of scaled data: {scaled_data.shape}")

# Train Isolation Forest model
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(scaled_data)

# Predict anomalies
anomalies = model.predict(scaled_data)

# Add anomalies to the original data
df['Anomaly'] = anomalies

# Filter the data to show only the anomalies
anomalies_data = df[df['Anomaly'] == -1]

# Print the anomalies
print("Anomalies Detected:")
print(anomalies_data)

# Save the anomalies to a new CSV file
anomalies_data.to_csv('cleandatasets/surf_plans_anomalies.csv', index=False)


Original DataFrame:
   user_id  total_messages_used  total_minutes  total_mb_used
0     1001              50715.0        63945.0       80437.94
1     1002              61627.0        77957.0      120731.27
2     1003              64227.0        85705.0      147775.41
3     1004             145647.0       255905.0      304128.22
4     1005             146307.0       259445.0      321268.39
Shape of scaled data: (262, 3)
Anomalies Detected:
     user_id  total_messages_used  total_minutes  total_mb_used  Anomaly
0       1001              50715.0        63945.0       80437.94       -1
1       1002              61627.0        77957.0      120731.27       -1
2       1003              64227.0        85705.0      147775.41       -1
3       1004             145647.0       255905.0      304128.22       -1
4       1005             146307.0       259445.0      321268.39       -1
5       1007             215922.0       363185.0      465121.13       -1
6       1014             217382.0       990286

In [None]:
from prophet import Prophet

df2 = df.drop(columns=['plan','total_messages_used', 'total_minutes'])
df2.head()

Unnamed: 0,user_id,session_date,mb_used
0,1000,2018-12-26,270.99
1,1000,2018-12-27,880.22
2,1000,2018-12-28,660.4
3,1000,2018-12-29,89.86
4,1000,2018-12-31,0.0


In [23]:
# Ensure both encoded_data and scaled_data have the correct dimensions
print(f'encoded_data shape: {encoded_data.shape}')
print(f'scaled_data shape: {scaled_data.shape}')

# Check the dimensions of each element in the tuple
print(f'Dimensions of the first element in the tuple: {encoded_data.ndim}')
print(f'Dimensions of the second element in the tuple: {scaled_data.ndim}')

# Combining encoded categorical data with scaled numerical data
try:
    processed_data = np.hstack((encoded_data, scaled_data))
    print(f'processed_data shape: {processed_data.shape}')
except ValueError as e:
    print(f'Error: {e}')
    print(f'encoded_data: {encoded_data[:5]}')
    print(f'scaled_data: {scaled_data[:5]}')


encoded_data shape: (7032, 27)
scaled_data shape: (7032, 4)
Dimensions of the first element in the tuple: 2
Dimensions of the second element in the tuple: 2
Error: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
encoded_data:   (0, 1)	1.0
  (0, 4)	1.0
  (0, 11)	1.0
  (0, 22)	1.0
  (0, 24)	1.0
  (1, 0)	1.0
  (1, 3)	1.0
  (1, 9)	1.0
  (1, 13)	1.0
  (1, 20)	1.0
  (1, 25)	1.0
  (2, 0)	1.0
  (2, 3)	1.0
  (2, 9)	1.0
  (2, 11)	1.0
  (2, 22)	1.0
  (2, 25)	1.0
  (2, 26)	1.0
  (3, 0)	1.0
  (3, 4)	1.0
  (3, 9)	1.0
  (3, 13)	1.0
  (3, 15)	1.0
  (3, 20)	1.0
  (4, 3)	1.0
  (4, 6)	1.0
  (4, 22)	1.0
  (4, 24)	1.0
  (4, 26)	1.0
scaled_data: [[-0.44032709 -1.28024804 -1.16169394 -0.99419409]
 [-0.44032709  0.06430269 -0.26087792 -0.17373982]
 [-0.44032709 -1.23950408 -0.36392329 -0.95964911]
 [-0.44032709  0.51248626 -0.74785042 -0.19524771]
 [-0.44032709 -1.23950408  0.19617818 -0.94045745]]


In [None]:
# Cell 11: Combine encoded categorical data with scaled numerical data
processed_data = np.hstack((encoded_data, scaled_data))

# Display the shape of processed data to ensure the combination is correct
processed_data.shape


### after facing the previous error, i had to revise the work and i found that i was using the wrong approach to solve the problem.

## The entire workflow as a script

In [25]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import IsolationForest

# Cell 2: Load the dataset
data = pd.read_csv('usage.csv')

# Display the first few rows of the dataset to understand its structure
#data.head()

# Cell 3: Drop irrelevant columns
data.drop(columns=['customerID'], inplace=True)

# Cell 4: Handle missing values and convert numerical columns to appropriate type
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')  # Convert TotalCharges to numeric, setting errors to NaN
data.dropna(inplace=True)  # Drop rows with NaN values

# Display the first few rows to ensure changes
#data.head()

# Cell 5: Identify categorical columns
categorical_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 
                    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']

# Cell 6: Initialize the OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Cell 7: Fit and transform the categorical columns
encoded_data = encoder.fit_transform(data[categorical_cols])

# Display the shape of encoded data to understand the number of features created
encoded_data.shape

# Cell 8: Identify numerical columns
numerical_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

# Cell 9: Initialize the StandardScaler
scaler = StandardScaler()

# Cell 10: Fit and transform the numerical columns
scaled_data = scaler.fit_transform(data[numerical_cols])

# Display the shape of scaled data to ensure correct scaling
scaled_data.shape

# # Ensure both encoded_data and scaled_data have the correct dimensions
# print(f'encoded_data shape: {encoded_data.shape}')
# print(f'scaled_data shape: {scaled_data.shape}')

# # Check the dimensions of each element in the tuple
# print(f'Dimensions of the first element in the tuple: {encoded_data.ndim}')
# print(f'Dimensions of the second element in the tuple: {scaled_data.ndim}')

# Cell 11: Combine encoded categorical data with scaled numerical data
try:
    processed_data = np.hstack((encoded_data, scaled_data))
    #print(f'processed_data shape: {processed_data.shape}')
except ValueError as e:
    print(f'Error: {e}')
    print(f'encoded_data: {encoded_data[:5]}')
    print(f'scaled_data: {scaled_data[:5]}')

# Continue with the Isolation Forest model training and prediction
# Cell 12: Train Isolation Forest model
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(processed_data)

# Cell 13: Predict anomalies
anomalies = model.predict(processed_data)

# Cell 14: Add anomalies to the original data
data['Anomaly'] = anomalies

# Cell 15: Display the anomalies
anomalies_data = data[data['Anomaly'] == -1]
print(anomalies_data)


      gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
27      Male              0     Yes        Yes       1           No   
54    Female              1     Yes        Yes      60          Yes   
62      Male              0     Yes         No      72           No   
71    Female              0     Yes        Yes      52          Yes   
79    Female              0     Yes        Yes      45          Yes   
...      ...            ...     ...        ...     ...          ...   
6996  Female              0     Yes        Yes      41          Yes   
7000  Female              0      No         No      67          Yes   
7007    Male              1     Yes         No      72           No   
7031    Male              1     Yes         No      55          Yes   
7036  Female              0      No         No      12           No   

         MultipleLines InternetService       OnlineSecurity  \
27    No phone service             DSL                   No   
54                  N

### Writing the anomalies to a CSV file

In [26]:

anomalies_data.to_csv('anomalies.csv', index=False)

FOR MESSAGES

In [3]:
import pandas as pd

# Load the dataset
df = pd.read_csv('megaline_messages.csv')

# Calculate the total messages per user
user_message_counts = df['user_id'].value_counts().reset_index()
user_message_counts.columns = ['user_id', 'total_messages_used']

# Merge the total messages information back into the original DataFrame
df = df.merge(user_message_counts, on='user_id')

# Display the first few rows to verify
print(df.head())


         id  user_id message_date  total_messages_used
0  1000_125     1000   2018-12-27                   11
1  1000_160     1000   2018-12-31                   11
2  1000_223     1000   2018-12-31                   11
3  1000_251     1000   2018-12-27                   11
4  1000_255     1000   2018-12-26                   11


In [44]:
import pandas as pd

# Load the dataset
df = pd.read_csv('megaline_messages.csv')

# Calculate the total messages per user
user_message_counts = df['user_id'].value_counts().reset_index()
user_message_counts.columns = ['user_id', 'total_messages_used']

# Merge the total messages information back into the original DataFrame
df = df.merge(user_message_counts, on='user_id')

# Drop the 'id' column
df = df.drop(columns=['id', 'message_date'])

# Drop duplicates to ensure one row per user_id
df = df.drop_duplicates(subset=['user_id'])

# Display the first few rows to verify
print(df.head())


df.to_csv('cleaned_messages.csv', index=False)

     user_id  total_messages_used
0       1000                   11
11      1001                  207
218     1002                   88
306     1003                   50
356     1004                  177


CALLS

In [39]:
df = pd.read_csv('megaline_calls.csv')

# calculate the total minutes oer user

minutes_count = df['user_id'].value_counts().reset_index()
minutes_count.columns = ['user_id', 'total_minutes']

df = df.merge(minutes_count, on= 'user_id')

df = df.drop_duplicates(subset = ['user_id'])

df.head()

Unnamed: 0,id,user_id,call_date,duration,total_minutes
0,1000_93,1000,2018-12-27,8.52,16
16,1001_0,1001,2018-09-06,10.06,261
277,1002_0,1002,2018-11-14,12.32,113
390,1003_0,1003,2018-12-28,0.0,149
539,1004_0,1004,2018-11-28,8.82,370


In [40]:
# Check if 'duration' column exists, then drop it
if 'duration' in df.columns:
    df = df.drop(columns=['duration', 'id'])
    print("\nUpdated DataFrame:")
    print(df.head())
else:
    print("\n'duration' column not found in the DataFrame.")


Updated DataFrame:
     user_id   call_date  total_minutes
0       1000  2018-12-27             16
16      1001  2018-09-06            261
277     1002  2018-11-14            113
390     1003  2018-12-28            149
539     1004  2018-11-28            370


In [41]:



df = df.drop(columns=['call_date'])


df.head()

Unnamed: 0,user_id,total_minutes
0,1000,16
16,1001,261
277,1002,113
390,1003,149
539,1004,370


In [42]:
df.to_csv('cleaned_calls.csv', index=False)

FOR THE INTERNET USAGE

In [47]:
import pandas as pd

# Load the dataset
df = pd.read_csv('megaline_internet.csv')

# Calculate the total mb_used per user
user_mb_used_totals = df.groupby('user_id')['mb_used'].sum().reset_index()
user_mb_used_totals.columns = ['user_id', 'total_mb_used']

# Merge the total mb_used information back into the original DataFrame
df = df.merge(user_mb_used_totals, on='user_id')

# Drop the 'id'
df = df.drop(columns=['id'])

# Save the cleaned DataFrame to a new CSV file
df.to_csv('trendusage.csv', index=False)

# Display the first few rows of the cleaned DataFrame to verify
print(df.head())


   user_id session_date  mb_used  total_mb_used
0     1000   2018-12-29    89.86        1901.47
1     1000   2018-12-31     0.00        1901.47
2     1000   2018-12-28   660.40        1901.47
3     1000   2018-12-26   270.99        1901.47
4     1000   2018-12-27   880.22        1901.47


CLEANING USERS

In [45]:
import pandas as pd


df = pd.read_csv('megaline_users.csv')

df = df.drop(columns=['first_name', 'last_name', 'age', 'city', 'reg_date', 'churn_date'])

df.head()

Unnamed: 0,user_id,plan
0,1000,ultimate
1,1001,surf
2,1002,surf
3,1003,surf
4,1004,surf


In [46]:
df.to_csv('cleaned_users.csv', index=False)