In [5]:
import sys
import os
import pandas as pd
import numpy as np
from scipy.stats import zscore
sys.path.append(os.path.abspath('C:/Users/nejat/AIM Projects/Telecommunication Data Analysis/src'))
from db_connection import PostgresConnection
from data_cleaning import preprocess_data  

def load_data():
    """Fetch every row of the ``xdr_data`` table and return it as a DataFrame.

    Opens a ``PostgresConnection`` to the ``tellco`` database, runs a
    ``SELECT *`` query, and labels the DataFrame columns with the names
    reported by the cursor description. The connection is closed even when
    the query or the DataFrame construction raises. The first rows are
    printed as a quick sanity check.

    Returns:
        pandas.DataFrame: the query result, one column per table column.
    """
    query = "SELECT * FROM xdr_data"
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to environment variables or a config file kept out of version
    # control.
    db = PostgresConnection(dbname='tellco', user='postgres', password='237132')
    db.connect()
    try:
        result = db.execute_query(query)
        # The first item of each description entry is used as the column name.
        df = pd.DataFrame(result, columns=[desc[0] for desc in db.cursor.description])
    finally:
        # Always release the connection, including on query failure.
        db.close_connection()
    print(df.head())
    return df

def analyze_handsets(df):
    """Print handset popularity summaries for the given records.

    Three reports are written to stdout, in this order:
      1. the ten most frequent values of ``'Handset Type'``;
      2. the three most frequent values of ``'Handset Manufacturer'``;
      3. for each of those three manufacturers, its five most frequent
         handset types.

    Args:
        df: DataFrame containing ``'Handset Type'`` and
            ``'Handset Manufacturer'`` columns.

    Returns:
        None. The results are only printed.
    """
    # Overall handset ranking.
    handset_ranking = df['Handset Type'].value_counts()
    print("Top 10 handsets:")
    print(handset_ranking.head(10))

    # Manufacturer ranking, limited to the three leaders.
    leading_makers = df['Handset Manufacturer'].value_counts().head(3)
    print("\nTop 3 manufacturers:")
    print(leading_makers)

    # Per-manufacturer handset ranking for each leader.
    for maker in leading_makers.index:
        made_by_maker = df['Handset Manufacturer'] == maker
        maker_ranking = df.loc[made_by_maker, 'Handset Type'].value_counts()
        print(f"\nTop 5 handsets for {maker}:")
        print(maker_ranking.head(5))

def aggregate_user_behavior(df):
    """Aggregate session records into one row per ``'IMSI'`` value.

    For every distinct ``'IMSI'`` the result holds the count of
    ``'Bearer Id'`` entries and the sum of the duration column and of each
    download/upload byte column listed below.

    Args:
        df: DataFrame containing ``'IMSI'``, ``'Bearer Id'``, ``'Dur. (ms)'``
            and all the byte-count columns named in this function.

    Returns:
        pandas.DataFrame: ``'IMSI'`` as a regular column followed by the
        aggregated columns, in the order they are declared here.
    """
    # Columns that are summed per IMSI; order determines output column order.
    summed_columns = [
        'Dur. (ms)',
        'Total DL (Bytes)',
        'Total UL (Bytes)',
        'HTTP DL (Bytes)',
        'HTTP UL (Bytes)',
        'Social Media DL (Bytes)',
        'Social Media UL (Bytes)',
        'Youtube DL (Bytes)',
        'Youtube UL (Bytes)',
        'Netflix DL (Bytes)',
        'Netflix UL (Bytes)',
        'Google DL (Bytes)',
        'Google UL (Bytes)',
        'Email DL (Bytes)',
        'Email UL (Bytes)',
        'Gaming DL (Bytes)',
        'Gaming UL (Bytes)',
        'Other DL (Bytes)',
        'Other UL (Bytes)',
    ]

    # 'Bearer Id' is counted rather than summed, and comes first.
    agg_spec = {'Bearer Id': 'count'}
    agg_spec.update(dict.fromkeys(summed_columns, 'sum'))

    per_imsi = df.groupby('IMSI').agg(agg_spec)
    return per_imsi.reset_index()

def main_analysis():
    """Run the analysis pipeline from loading through aggregation.

    Loads the raw records with ``load_data``, passes them through
    ``preprocess_data``, prints the handset reports, then prints the first
    rows of the per-user aggregation.
    """
    cleaned = preprocess_data(load_data())

    analyze_handsets(cleaned)

    per_user = aggregate_user_behavior(cleaned)
    print("\nUser Behavior Aggregated Data:")
    print(per_user.head())

# Run the whole analysis when this cell/script executes.
main_analysis()


Connected to PostgreSQL database!
Connection closed.
      Bearer Id            Start  Start ms              End  End ms  \
0  1.311448e+19   4/4/2019 12:01     770.0  4/25/2019 14:35   662.0   
1  1.311448e+19   4/9/2019 13:04     235.0   4/25/2019 8:15   606.0   
2  1.311448e+19   4/9/2019 17:42       1.0  4/25/2019 11:58   652.0   
3  1.311448e+19   4/10/2019 0:31     486.0   4/25/2019 7:36   171.0   
4  1.311448e+19  4/12/2019 20:10     565.0  4/25/2019 10:40   954.0   

   Dur. (ms)          IMSI  MSISDN/Number          IMEI  \
0  1823652.0  2.082014e+14   3.366496e+10  3.552121e+13   
1  1365104.0  2.082019e+14   3.368185e+10  3.579401e+13   
2  1361762.0  2.082003e+14   3.376063e+10  3.528151e+13   
3  1321509.0  2.082014e+14   3.375034e+10  3.535661e+13   
4  1089009.0  2.082014e+14   3.369980e+10  3.540701e+13   

      Last Location Name  ...  Youtube DL (Bytes)  Youtube UL (Bytes)  \
0  9.16456699548519E+015  ...          15854611.0           2501332.0   
1                L7