In [1]:
# Import Necessary modules
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Read the connections export into a DataFrame; the relative path means the
# CSV has to sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Connections.csv")

In [3]:
# Preview the first five rows of the freshly loaded data
df.head(n=5)

Unnamed: 0,First Name,Last Name,Email Address,Company,Position,Connected On
0,Deepak,Seth,,Accenture,Technology Consulting Principal Director,22 May 2020
1,Afzal,Mansury,,OYO,Project Lead,21 May 2020
2,Nayanava,De,,Microsoft,Software Development Engineer II,21 May 2020
3,Siddhartha,Banerjee,,Traveloka,Senior Data Scientist,21 May 2020
4,Wei Hong,Low,,Shopee,Regional Data Scientist,21 May 2020


In [4]:
# Summarise the dataset: number of rows plus the non-null count and dtype of
# every column (useful for spotting mostly-empty columns before cleaning).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6560 entries, 0 to 6559
Data columns (total 6 columns):
First Name       6540 non-null object
Last Name        6540 non-null object
Email Address    87 non-null object
Company          6481 non-null object
Position         6482 non-null object
Connected On     6560 non-null object
dtypes: object(6)
memory usage: 307.6+ KB


In [5]:
# converting date format
import datetime
def fun(date, in_fmt="%d %b %Y", out_fmt="%Y-%m-%d"):
  """Re-format a date string, by default ``"22 May 2020"`` -> ``"2020-05-22"``.

  The default output is ISO-8601 (year-month-day, zero padded), so plain
  string sorting of the results is also chronological sorting.

  Args:
    date: Date text laid out according to ``in_fmt``.
    in_fmt: ``strptime`` pattern describing ``date``. Defaults to
      day, abbreviated month name, four-digit year.
    out_fmt: ``strftime`` pattern used for the returned text.

  Returns:
    The same calendar date rendered with ``out_fmt``.

  Raises:
    ValueError: if ``date`` does not match ``in_fmt``.
  """
  parsed = datetime.datetime.strptime(date, in_fmt)
  return parsed.strftime(out_fmt)

# Rewrite every "Connected On" entry through fun() so the column holds
# year-first date strings.
df["Connected On"] = df["Connected On"].map(fun)

In [6]:
# Order the rows from the earliest to the latest connection date
df = df.sort_values("Connected On", ascending=True)

In [7]:
# Check that the oldest connections now come first
df.head(n=5)

Unnamed: 0,First Name,Last Name,Email Address,Company,Position,Connected On
6559,•Aditya,•Varma,,Genus Power infrastructure ltd,Intern,2013-06-17
6558,Anjali,Gupta,,,,2017-05-29
6557,Aman,Goel,,AllinCall,Co-founder & CEO,2017-06-06
6556,Saatvik,Saini,,CORE VALUE TECHNOLOGIES PRIVATE LIMITED,Software Engineer Intern,2017-07-19
6555,Prakhar,Katariya,,Swami Keshwanand Inst. Of Tech. Mgt. & Gramoth...,Student,2017-07-19


In [8]:
# Reset the index twice: the first call moves the pre-sort row labels into an
# "index" column; the second, finding "index" already taken, adds "level_0"
# holding the fresh 0..n-1 positions in chronological order.
df = df.reset_index().reset_index()

In [9]:
# Inspect the two helper columns ("level_0", "index") that were just added
df.head(n=5)

Unnamed: 0,level_0,index,First Name,Last Name,Email Address,Company,Position,Connected On
0,0,6559,•Aditya,•Varma,,Genus Power infrastructure ltd,Intern,2013-06-17
1,1,6558,Anjali,Gupta,,,,2017-05-29
2,2,6557,Aman,Goel,,AllinCall,Co-founder & CEO,2017-06-06
3,3,6556,Saatvik,Saini,,CORE VALUE TECHNOLOGIES PRIVATE LIMITED,Software Engineer Intern,2017-07-19
4,4,6555,Prakhar,Katariya,,Swami Keshwanand Inst. Of Tech. Mgt. & Gramoth...,Student,2017-07-19


In [10]:
# Discard the stale pre-sort labels ("index") and expose the running counter
# "level_0" under the clearer name "number".
df = df.drop(columns=["index"]).rename(columns={"level_0": "number"})

In [11]:
# Confirm that only the "number" counter column remains
df.head(n=5)

Unnamed: 0,number,First Name,Last Name,Email Address,Company,Position,Connected On
0,0,•Aditya,•Varma,,Genus Power infrastructure ltd,Intern,2013-06-17
1,1,Anjali,Gupta,,,,2017-05-29
2,2,Aman,Goel,,AllinCall,Co-founder & CEO,2017-06-06
3,3,Saatvik,Saini,,CORE VALUE TECHNOLOGIES PRIVATE LIMITED,Software Engineer Intern,2017-07-19
4,4,Prakhar,Katariya,,Swami Keshwanand Inst. Of Tech. Mgt. & Gramoth...,Student,2017-07-19


In [29]:
# Line chart of the running connection counter ("number") against the date each
# connection was made, i.e. how the network grew over time. The figure is shown
# inline and also exported as a standalone HTML page.
connections_line = px.line(
    data_frame=df,
    x="Connected On",
    y="number",
    title="My Connections",
)
connections_line.show()
connections_line.write_html("line_graph.html")

In [13]:
# Remove the "Email Address" column; it is populated for only 87 rows
df = df.drop(columns=["Email Address"])

In [14]:
# Keep only rows in which every remaining column has a value
df = df.dropna(axis="index", how="any")

In [15]:
# Rows with missing values are gone, so the index labels now have gaps
df.head(n=5)

Unnamed: 0,number,First Name,Last Name,Company,Position,Connected On
0,0,•Aditya,•Varma,Genus Power infrastructure ltd,Intern,2013-06-17
2,2,Aman,Goel,AllinCall,Co-founder & CEO,2017-06-06
3,3,Saatvik,Saini,CORE VALUE TECHNOLOGIES PRIVATE LIMITED,Software Engineer Intern,2017-07-19
4,4,Prakhar,Katariya,Swami Keshwanand Inst. Of Tech. Mgt. & Gramoth...,Student,2017-07-19
5,5,Dinesh,Sharma,BOSE CORPORATION INDIA PRIVATE LIMITED,"ONLINE AND SOCIAL MEDIA Designer,",2017-08-23


In [16]:
# Collapse every internship-style title ("Software Engineer Intern",
# "Summer Internship", "interns") into the single label "Intern".
# The pattern is anchored on word boundaries so titles that merely begin with
# the same letters ("International Sales Manager", "Internal Auditor",
# "Internet Marketer") keep their original text, and it ignores case so
# "INTERN" / "intern" are normalised as well. na=False treats a missing title
# as a non-match instead of yielding an unusable boolean mask.
is_intern = df["Position"].str.contains(
    r"\bintern(?:s|ships?)?\b", case=False, na=False, regex=True
)
df.loc[is_intern, "Position"] = "Intern"

In [17]:
# Look at the first ten rows to verify the "Intern" relabelling
df.head(n=10)

Unnamed: 0,number,First Name,Last Name,Company,Position,Connected On
0,0,•Aditya,•Varma,Genus Power infrastructure ltd,Intern,2013-06-17
2,2,Aman,Goel,AllinCall,Co-founder & CEO,2017-06-06
3,3,Saatvik,Saini,CORE VALUE TECHNOLOGIES PRIVATE LIMITED,Intern,2017-07-19
4,4,Prakhar,Katariya,Swami Keshwanand Inst. Of Tech. Mgt. & Gramoth...,Student,2017-07-19
5,5,Dinesh,Sharma,BOSE CORPORATION INDIA PRIVATE LIMITED,"ONLINE AND SOCIAL MEDIA Designer,",2017-08-23
6,6,vidyottama,kanoria,Internshala,Intern,2017-08-26
7,7,Prateek,Dwivedi,GENX SOFT TECHNOLOGIES (P) LTD,Intern,2017-09-07
8,8,Sourabh,Khatri,Vishal Mega Mart (Airplaza Retail Holdings Pri...,Talent Acquisition Specialist,2017-09-23
9,9,Akshay,Sethi,IntrCity by RailYatri,Product Manager,2017-11-15
10,10,Pankaj,Sharma,Videndaa,Co-Founder,2017-11-21


In [18]:
# Keep the "Company" column under its own name for the histogram
Companies = df.loc[:, "Company"]

In [30]:
# Histogram counting how many connections belong to each company; shown inline
# and saved as a standalone HTML page.
hist_company = go.Figure(data=[go.Histogram(x=Companies, histfunc="count")])
hist_company.show()
hist_company.write_html("company.html")

In [20]:
# Count connections per company once and keep the 79 most frequent entries
# (the slice stops before position 79, so this is 79 companies, not 80), then
# split them into parallel lists of names and counts for plotting.
top_company_counts = df["Company"].value_counts().iloc[:79]
company = top_company_counts.index.tolist()
count_comp = top_company_counts.tolist()

In [31]:
# Bar chart of the companies that account for the most connections
fig = go.Figure()
fig.add_trace(go.Bar(x=company, y=count_comp))
fig.show()
fig.write_html("top_connections_company.html")

In [33]:
# Histogram counting how many connections hold each position title
fig = go.Figure(data=[go.Histogram(x=df["Position"], histfunc="count")])
fig.show()
fig.write_html("positions.html")

In [23]:
# Count connections per position title once and keep the 79 most frequent
# entries (the slice stops before position 79, so this is 79 titles, not 80),
# split into parallel lists of titles and counts for plotting.
top_position_counts = df["Position"].value_counts().iloc[:79]
position = top_position_counts.index.tolist()
count_pos = top_position_counts.tolist()

In [24]:
# Sanity check: how many companies and positions were kept for the bar charts
print(*(len(names) for names in (company, position)))

79 79


In [34]:
# Bar chart of the position titles held by the most connections
fig = go.Figure()
fig.add_trace(go.Bar(x=position, y=count_pos))
fig.show()
fig.write_html("top_connections_positions.html")

In [26]:
# Add a constant "My Network" column; every row carries the same value so it
# can serve as the single root level of the treemap paths.
df = df.assign(**{"My Network": "My Network"})

In [35]:
# Treemap of the network grouped first by position and then by company
# (the larger a tile, the more connections it contains).
position_tree_map = px.treemap(
    data_frame=df,
    path=["My Network", "Position", "Company"],
    width=1000,
    height=1000,
)
position_tree_map.show()
position_tree_map.write_html("position_map.html")

In [36]:
# Treemap of the network grouped first by company and then by position
# (the larger a tile, the more connections it contains).
company_tree_map = px.treemap(
    data_frame=df,
    path=["My Network", "Company", "Position"],
    width=1000,
    height=1000,
)
company_tree_map.show()
company_tree_map.write_html("company_map.html")