## Graph making and analysis

In [6]:
import os
import csv
import json
from collections import defaultdict

def normalize_name(name):
    name = name.strip()
    name = name.lower()
    name_parts = name.split()
    title_case_name = " ".join(part.capitalize() for part in name_parts)
    return title_case_name

def find_valid_headers(file_path):
    with open(file_path, newline='', encoding='utf-8-sig', errors='replace') as csvfile:
        reader = csv.reader(csvfile)
        count = 0
        for row in reader:
            if "First Name" in row and "Last Name" in row:
                return row
            count += 1
            if count >= 5:
                break
    return None

def generate_network_graph(data_folder):
    graph = defaultdict(set)
    names_from_files = set()
    errors = []

    for file in os.listdir(data_folder):
        if not file.endswith(".csv"):
            continue

        person_name = os.path.splitext(file)[0]
        person_name = normalize_name(person_name)
        names_from_files.add(person_name)
        graph[person_name]

        file_path = os.path.join(data_folder, file)
        headers = find_valid_headers(file_path)
        if headers is None:
            continue

        with open(file_path, newline='', encoding='utf-8-sig', errors='replace') as csvfile:
            reader = csv.DictReader(csvfile, fieldnames=headers)
            next(reader, None)

            row_number = 2
            for row in reader:
                try:
                    first = row.get("First Name", "")
                    last = row.get("Last Name", "")
                    first = normalize_name(first)
                    last = normalize_name(last)
                    full_name = (first + " " + last).strip()
                    if full_name:
                        graph[person_name].add(full_name)
                        graph[full_name].add(person_name)
                except:
                    errors.append(f"{file} row {row_number}")
                row_number += 1

    final_graph = {}
    for person in graph:
        final_graph[person] = sorted(list(graph[person]))
    return final_graph

def save_json(data, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

def save_adjacency_list(graph, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        for name in sorted(graph.keys()):
            line = name + " --> " + ", ".join(graph[name]) + "\n"
            f.write(line)

# Main Execution
data_folder = r"C:\\Desktop\\MFC ass\\LinkedIn Data Public"
graph = generate_network_graph(data_folder)

save_json(graph, "network_graph.json")
save_adjacency_list(graph, "adjacency_list.txt")

print("Graph created successfully and saved to JSON and TEXT file\n")
print("Adjacency List (Top 100 Nodes):\n")

# Print only top 100 nodes alphabetically
for name in sorted(graph.keys())[:100]:
    line = name + " --> " + ", ".join(graph[name])
    print(line)


Graph created successfully and saved to JSON and TEXT file

Adjacency List (Top 100 Nodes):

A D S David Babu --> Ayush Kumar
A Dilli Prasad --> Yuvraj Chirag
A K M --> Gaurav Rathore
A K Saini Phd --> Neeraj Parmar
A K Singh --> Ramraj Nagar
A Moksha Sulif --> Pranjal Dubey
A Nandhan --> Byagari Praveen Kumar
Aabha Shukla --> Prabhat Patidar
Aabhash Mukherjee --> Himanshu Kumar, Rohit Malviya
Aabir Mukhopadhyay --> Anamika Kumari, Ekta Kumari
Aachal Bansod --> Manoj Kharkar
Aachal Sharma --> Divyanshi Sahu
Aadarsh Armil --> Alok Raj
Aadarsh Gupta --> Manoj Kharkar
Aadarsh Kumar --> Ekta Kumari, Gaurav Rathore, Pawan Kushwah, Rahul Verma
Aadarsh Kumar Nirmal --> Shilpi Shaw
Aadarsh Negi --> Aman Singh, Aryan Saini
Aadarsh Ranjan --> Manoj Dewda, Nirmal Mewada
Aadarsh S --> Ravi Rajput
Aadarsh Singh --> Aryan Saini, Prabhat Patidar, Ravi Rajput
Aadarsh Sridhar --> Rohit Malviya
Aadesh Alawe --> Anuradha Tiwari, Himanshu Kumar, Monu Rajpoot
Aadesh Bishnoi --> Chandan Giri, Pragati Chauha

### Cleaned graph data

In [12]:
import json
import re

def clean_name(name):
    # Remove non-alphabetic characters and extra spaces
    name = re.sub(r'[^A-Za-z\s]', '', name)
    name = " ".join(name.strip().split())  # Normalize spacing
    return name.title()

def clean_json_file(input_path, output_path):
    print(f"Cleaning JSON file: {input_path}...")
    
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    cleaned_data = {}
    cleaned_count = 0  # To track how many records were cleaned
    
    for key, values in data.items():
        clean_key = clean_name(key)
        clean_values = [clean_name(v) for v in values if clean_name(v)]

        if clean_values:  # Only include if there are connections
            cleaned_data[clean_key] = sorted(set(clean_values))
            cleaned_count += 1
    
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(cleaned_data, f, indent=4)
    
    print(f"Cleaning completed. Cleaned {cleaned_count} records.")
    print(f"Cleaned JSON saved to: {output_path}")

def clean_text_file(input_path, output_path):
    print(f"Cleaning Text file: {input_path}...")
    
    cleaned_lines = []
    cleaned_count = 0  # To track how many lines were cleaned
    
    with open(input_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for line in lines:
        if "-->" in line:
            name, connections = line.split("-->")
            name = clean_name(name)
            connections_list = [clean_name(c.strip()) for c in connections.split(",") if clean_name(c.strip())]

            if connections_list:  # Only include if there are connections
                cleaned_line = name + " --> " + ", ".join(sorted(set(connections_list)))
                cleaned_lines.append(cleaned_line)
                cleaned_count += 1
    
    with open(output_path, "w", encoding="utf-8") as f:
        for line in cleaned_lines:
            f.write(line + "\n")
    
    print(f"Cleaning completed. Cleaned {cleaned_count} lines.")
    print(f"Cleaned Text saved to: {output_path}")

# Clean both the JSON and Text files
clean_json_file("network_graph.json", "cleaned_network_graph.json")
clean_text_file("adjacency_list.txt", "cleaned_adjacency_list.txt")

print("All cleaning tasks completed successfully.")


Cleaning JSON file: network_graph.json...
Cleaning completed. Cleaned 28616 records.
Cleaned JSON saved to: cleaned_network_graph.json
Cleaning Text file: adjacency_list.txt...
Cleaning completed. Cleaned 28616 lines.
Cleaned Text saved to: cleaned_adjacency_list.txt
All cleaning tasks completed successfully.


In [8]:
import json

def calculate_degrees(graph):
    degrees = {}
    
    for student, connections in graph.items():
        degrees[student] = len(connections)  # Degree is the number of connections
    
    return degrees

def load_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

def save_degrees(degrees, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        for student, degree in degrees.items():
            f.write(f"{student}: {degree}\n")
            print(f"{student}: {degree}")  # Print to console


graph_data = load_json("network_graph.json")
degrees = calculate_degrees(graph_data)
save_degrees(degrees, "degrees.txt")


Aaditya Raj: 271
Harsh Kumar Singh: 4
Saurabh Singh: 79
Anamika Kumari: 880
Hariom Parmar: 78
Sukheâ Ðº: 33
Mani Kumar: 57
Himanshu Srivastav: 476
Mudasir Ahmad: 7
Ayush Katiyar: 2
Radhika Pal: 14
Punam Mandal: 76
Prakash Behera: 67
Nikita Paikara: 7
Mohammad Zaid: 22
Rudraksh Namdeo: 1
Ankit Kumar: 68
Aakash Kumar: 110
Amrita Yadav: 51
Adityansh Chand: 8
Mohd Shadab Ansari: 54
Rohit Pingale: 1
Amit Diwakar: 85
Siddharth Sahani: 1
Hrishikesh Dixit: 2
Vishal Bhardwaj: 383
Mohit Sharma: 1266
Divyanshu Fartiyal: 7
Ritik Singh: 142
Rudra Pratap Singh Chauhan: 1
Abhinav Mishra: 22
Vishal Dhaniya: 42
Amrita Kumari: 2
Chhaya Pawar: 47
Shivani Parashwar: 1
Nishant Rasekar: 2
Neha Sawadkar: 3
Abhishek Tripathi: 93
Nalini Agarwal: 1
Prince Yadav: 25
Abhishek Kumar: 102
Rani Kumari: 307
Nitinnimish Rastogi: 1
Nikhil Raj Soni: 122
Manvendra Singh: 100
Ranjan Singh: 107
Guman Singh: 108
Maneesh Sakhwar: 104
Challa Trivedh Kumar: 2915
Mayank Singh Tomar: 86
Deependra Shukla: 94
Priyanshu Yadav: 10
M

### Given any pair of students, find a random walk connecting them.

Prune this random walk to find a path connecting the given pair or
students.

In [9]:
import json
import random
import matplotlib.pyplot as plt
import networkx as nx

def random_walk_and_prune(graph, source, target, max_steps=100):
    path = [source]
    current = source
    while current != target and len(path) < max_steps:
        neighbors = list(graph.get(current, []))
        if not neighbors:
            break
        current = random.choice(neighbors)
        path.append(current)
    print(f"Original Random Walk Path: {path}")

    pruned_path = []
    seen = set()

    for node in path:
        if node not in seen:
            seen.add(node)
            pruned_path.append(node)
    if pruned_path[0] != source:
        pruned_path.insert(0, source)
    if pruned_path[-1] != target:
        pruned_path.append(target) 
    print(f"Pruned Path: {pruned_path}")
    return path, pruned_path

with open('network_graph.json', 'r') as f:
    graph = json.load(f)

source_student = "Manoj Dewda"
target_student = "Yuvraj Chirag"
path, pruned_path = random_walk_and_prune(graph, source_student, target_student)



Original Random Walk Path: ['Manoj Dewda', 'Pankaj Mewada', 'Manoj Dewda', 'Priyanshi Gothwal', 'Manoj Dewda', 'David Charoh', 'Aryan Saini', 'Aman Chaudhari', 'Aryan Saini', 'Dinesh Chandra Mishra', 'Shubham Kang', 'Shamail Khan', 'Yuvraj Chirag']
Pruned Path: ['Manoj Dewda', 'Pankaj Mewada', 'Priyanshi Gothwal', 'David Charoh', 'Aryan Saini', 'Aman Chaudhari', 'Dinesh Chandra Mishra', 'Shubham Kang', 'Shamail Khan', 'Yuvraj Chirag']


### Some statistical estimates of the length of these random walks and the pruned paths.

In [10]:
import json
import random
import numpy as np

def random_walk_and_prune(graph, source, target, max_steps=100):
    path = [source]
    current = source

    while current != target and len(path) < max_steps:
        neighbors = list(graph.get(current, []))
        if not neighbors:
            break
        current = random.choice(neighbors)
        path.append(current)

    pruned_path = []
    seen = set()

    for node in path:
        if node not in seen:
            seen.add(node)
            pruned_path.append(node)

    if pruned_path[0] != source:
        pruned_path.insert(0, source)
    if pruned_path[-1] != target:
        pruned_path.append(target)

    return path, pruned_path

def compute_statistics(path, pruned_path):
    print("\n--- RANDOM WALK ANALYSIS ---")

    random_walk_length = len(path)
    pruned_path_length = len(pruned_path)
    unique_nodes = len(set(path))

    reduction = ((random_walk_length - pruned_path_length) / random_walk_length) * 100 if random_walk_length else 0
    path_density = unique_nodes / random_walk_length if random_walk_length else 0

    lengths = [random_walk_length, pruned_path_length]
    mean_length = np.mean(lengths)
    median_length = np.median(lengths)
    std_dev_length = np.std(lengths)
    min_len = np.min(lengths)
    max_len = np.max(lengths)

    print(f"Source Node                    : {path[0]}")
    print(f"Target Node                    : {path[-1]}")
    print(f"Total Steps in Random Walk     : {random_walk_length}")
    print(f"Steps after Pruning (Unique)   : {pruned_path_length}")
    print(f"Total Unique Nodes Visited     : {unique_nodes}")
    print(f"Reduction in Steps (%)         : {reduction:.2f}%")
    print(f"Path Density (Unique/Total)    : {path_density:.2f}")

    print("\n--- STATISTICAL SUMMARY ---")
    print(f"Mean Path Length               : {mean_length:.2f}")
    print(f"Median Path Length             : {median_length}")
    print(f"Standard Deviation             : {std_dev_length:.2f}")
    print(f"Minimum Path Length            : {min_len}")
    print(f"Maximum Path Length            : {max_len}")

    print("\n--- FULL RANDOM WALK PATH ---")
    print(" -> ".join(path))

    print("\n--- PRUNED UNIQUE PATH ---")
    print(" -> ".join(pruned_path))

# Load graph
with open('network_graph.json', 'r') as f:
    graph = json.load(f)

source_student = "Manoj Dewda"
target_student = "Yuvraj Chirag"

path, pruned_path = random_walk_and_prune(graph, source_student, target_student)
compute_statistics(path, pruned_path)



--- RANDOM WALK ANALYSIS ---
Source Node                    : Manoj Dewda
Target Node                    : Yuvraj Chirag
Total Steps in Random Walk     : 79
Steps after Pruning (Unique)   : 66
Total Unique Nodes Visited     : 66
Reduction in Steps (%)         : 16.46%
Path Density (Unique/Total)    : 0.84

--- STATISTICAL SUMMARY ---
Mean Path Length               : 72.50
Median Path Length             : 72.5
Standard Deviation             : 6.50
Minimum Path Length            : 66
Maximum Path Length            : 79

--- FULL RANDOM WALK PATH ---
Manoj Dewda -> Sanjay V P -> Bhaskar Mahato -> Shankho Subhra Pal -> Samina Sultana -> Chelsi Saini -> Gaurav Rathore -> Gagan Kumar -> Gaurav Rathore -> Aditya Singhal -> Ravi Rajput -> Sushant Suryawanshi -> Rahul Verma -> Sachin Zade -> Vishal Kumar -> Suraj Mishra -> Nirmal Mewada -> Shivani Verma -> Nirmal Mewada -> Kunal Karn -> Rohit Malviya -> Siddhartha Porika -> Ravi Rajput -> Subrat Behera -> Neeraj Parmar -> Apurva Kamble -> Rohi

In [11]:
import json
import statistics

with open("network_graph.json", "r") as f:
    graph = json.load(f)

# Total students
total_students = len(graph)

# Total unique connections (edges)
total_connections = sum(len(neighbors) for neighbors in graph.values()) // 2

# Degree of each student
degrees = {student: len(neighbors) for student, neighbors in graph.items()}

# Top 5 most connected students
top_connected = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:5]

# Students with 0 connections
isolated_students = [student for student, neighbors in graph.items() if len(neighbors) == 0]

# Average degree
average_degree = sum(degrees.values()) / total_students

# Density of the graph
if total_students > 1:
    density = (2 * total_connections) / (total_students * (total_students - 1))
else:
    density = 0

# Display statistics
print("--- LinkedIn Network Statistics ---")
print("Total Students:", total_students)
print("Total Unique Connections:", total_connections)
print("Average Degree:", round(average_degree, 2))
print("Graph Density:", round(density, 4))
print("Top 5 Connected Students:")
for name, deg in top_connected:
    print(f"  {name}: {deg} connections")
print("Students with 0 Connections:", len(isolated_students))

# Sample Random Walk & Pruned Path Analysis
import random

def random_walk_and_prune(graph, source, target, max_steps=100):
    path = [source]
    current = source
    while current != target and len(path) < max_steps:
        neighbors = list(graph.get(current, []))
        if not neighbors:
            break
        current = random.choice(neighbors)
        path.append(current)
    pruned_path = []
    seen = set()
    for node in path:
        if node not in seen:
            seen.add(node)
            pruned_path.append(node)
    if pruned_path[0] != source:
        pruned_path.insert(0, source)
    if pruned_path[-1] != target:
        pruned_path.append(target)
    return len(path), len(pruned_path)

# Run stats for multiple pairs
walk_lengths = []
pruned_lengths = []
students = list(graph.keys())

for _ in range(10):
    s1, s2 = random.sample(students, 2)
    walk_len, pruned_len = random_walk_and_prune(graph, s1, s2)
    walk_lengths.append(walk_len)
    pruned_lengths.append(pruned_len)

print("\n--- Random Walk vs Pruned Path Statistics ---")
print("Average Random Walk Length:", round(statistics.mean(walk_lengths), 2))
print("Average Pruned Path Length:", round(statistics.mean(pruned_lengths), 2))
print("Median Random Walk Length:", statistics.median(walk_lengths))
print("Median Pruned Path Length:", statistics.median(pruned_lengths))
print("Std Dev of Random Walk:", round(statistics.stdev(walk_lengths), 2))
print("Std Dev of Pruned Path:", round(statistics.stdev(pruned_lengths), 2))

--- LinkedIn Network Statistics ---
Total Students: 28616
Total Unique Connections: 101200
Average Degree: 7.07
Graph Density: 0.0002
Top 5 Connected Students:
  Rohit Malviya: 4261 connections
  Ravi Rajput: 4072 connections
  Manoj Dewda: 3971 connections
  Ramraj Nagar: 3716 connections
  Nirmal Mewada: 3527 connections
Students with 0 Connections: 0

--- Random Walk vs Pruned Path Statistics ---
Average Random Walk Length: 100
Average Pruned Path Length: 78.8
Median Random Walk Length: 100.0
Median Pruned Path Length: 78.5
Std Dev of Random Walk: 0.0
Std Dev of Pruned Path: 2.1
