In [None]:
pip install gender-detector



In [None]:
import pandas as pd
from gender_detector import gender_detector as gd
from datetime import date
import plotly.graph_objects as go
import plotly.express as px  

In [None]:
# Load the connections export from the working directory.
# NOTE(review): presumably the LinkedIn "Connections.csv" export — confirm the
# file has no preamble rows before the header.
df_connections = pd.read_csv('Connections.csv')

In [None]:
# Show the first row to check the column layout.
df_connections.head(1)

Unnamed: 0,First Name,Last Name,Email Address,Company,Position,Connected On
0,maria pepe,Test,tes@test.com,Test,Ceo,2021-12-14


In [None]:
# Column dtypes and non-null counts of the raw frame.
df_connections.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   First Name     1918 non-null   object
 1   Last Name      1918 non-null   object
 2   Email Address  57 non-null     object
 3   Company        1849 non-null   object
 4   Position       1851 non-null   object
 5   Connected On   1918 non-null   object
dtypes: object(6)
memory usage: 90.0+ KB


In [None]:
# Number of missing values in each column.
df_connections.isna().sum()

First Name          0
Last Name           0
Email Address    1861
Company            69
Position           67
Connected On        0
dtype: int64

In [None]:
# Preview the company, position and connection-date columns, picked by label.
df_connections[['Company', 'Position', 'Connected On']].head()

Unnamed: 0,Company,Position,Connected On
0,Test,Ceo,2021-12-14
1,Novakorp,Data Analyst,2021-12-14
2,Webstarted,RECRUITER IT,2021-12-14
3,Deckard Technologies,Data Entry Analyst,2021-12-14
4,Aicoll,Junior Data Analyst,2021-12-14


Analizamos las compañías según la cantidad de conexiones.

In [None]:
# Connections per company as a (Company, cantidad) frame, largest first.
df_connections_company = (
    df_connections['Company']
    .value_counts()
    .rename_axis('Company')
    .reset_index(name='cantidad')
    .sort_values(by='cantidad', ascending=False)
)
df_connections_company.head()

Unnamed: 0,Company,cantidad
0,Mercado Libre,58
1,Springboard,31
2,COREBI Data & Analytics,28
3,Quales Group,18
4,Freelance,16


In [None]:
# Keep only the companies with more than 10 connections.
df_connections_company = df_connections_company.loc[
    lambda frame: frame['cantidad'] > 10
]

In [None]:
# Horizontal bar chart: one bar per company, length = number of connections.
fig = px.bar(
    df_connections_company,
    x='cantidad',
    y='Company',
    orientation='h',
    height=400,
    labels={'cantidad': 'Conexiones por compañia'},
)
# Light-blue bars with a dark-blue outline.
fig.update_traces(
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.6,
)
fig.update_layout(title=dict(text='Conexiones en Linkedin'))

fig.show()

Filtro por posición para saber qué puesto es el que tiene más conexiones en mi perfil.

In [None]:
# Count connections for every (Company, Position) pair, largest first.
df_position_filter = (
    df_connections
    .groupby(['Company', 'Position'])['Connected On']
    .count()
    .reset_index(name='cantidad')
    .sort_values(by='cantidad', ascending=False)
)
df_position_filter.head()

Unnamed: 0,Company,Position,cantidad
1455,Springboard,Data Science Fellow,8
932,Mercado Libre,Data Scientist,7
1449,Springboard,Data Analyst Fellow,6
1247,Quales Group,Especialista BI & Analytics,5
1090,Novakorp,Data Engineer,4


In [None]:
# Plot only the (Company, Position) pairs with more than two connections.
# Charting the unfiltered frame draws one bar segment per pair (well over a
# thousand rows) into a 400-px-high figure, which is unreadable.
top_positions = df_position_filter[df_position_filter['cantidad'] > 2]

fig = px.bar(top_positions, x='cantidad', y='Position',
             labels={'cantidad': 'Conexiones por roles'},
             height=400,
             orientation='h')
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)
fig.update_layout(title_text='Conexiones en Linkedin por roles')

fig.show()

In [None]:
# Keep only the (Company, Position) pairs with more than 2 connections.
# NOTE(review): no later cell visible here reads df_position_filter — confirm
# whether this filter is still needed.
df_position_filter = df_position_filter[df_position_filter['cantidad'] > 2]

Empezamos a trabajar con los nombres para detectar el género.

In [None]:
# Preview the first column (First Name) before any name processing.
df_connections.iloc[:,[0]].head()

Unnamed: 0,First Name
0,maria pepe
1,Cecilia
2,Gabriela
3,Bella
4,Carlos Andres


In [None]:
# Convert the connection date from text to datetime64.
df_connections['Connected On'] = pd.to_datetime(df_connections['Connected On'])

In [None]:
# Re-check dtypes: 'Connected On' should now be datetime64[ns].
df_connections.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   First Name     1918 non-null   object        
 1   Last Name      1918 non-null   object        
 2   Email Address  57 non-null     object        
 3   Company        1849 non-null   object        
 4   Position       1851 non-null   object        
 5   Connected On   1918 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 90.0+ KB


In [None]:
# Connections made on each date, busiest days first.
df_connections_count = (
    df_connections
    .groupby('Connected On')['First Name']
    .count()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
)

In [None]:
# The five dates with the most new connections.
df_connections_count.head()

Unnamed: 0,Connected On,Count
321,2021-12-11,116
322,2021-12-12,70
323,2021-12-13,58
218,2021-07-28,50
177,2021-06-15,48


Distribución por fecha de conexión

In [None]:
# Scatter of connections per day: one marker per connection date.
fig = go.Figure()

fig.add_trace(go.Scatter(x = df_connections_count['Connected On'], y=df_connections_count['Count'],
                         mode = 'markers'))
# Title and axis labels so the figure can be read on its own.
fig.update_layout(title_text='Conexiones por fecha',
                  xaxis_title='Fecha de conexión',
                  yaxis_title='Cantidad de conexiones')

fig.show()

Tratamiento del nombre para detectar los géneros

In [None]:
# Preview the raw First Name column that the next cells split into tokens.
df_connections.iloc[:,[0]].head()

Unnamed: 0,First Name
0,maria pepe
1,Cecilia
2,Gabriela
3,Bella
4,Carlos Andres


Separar el nombre y quedarnos solo con el primero

In [None]:
# Split "First Name" on whitespace; each token gets its own column.
name = df_connections["First Name"].str.split(expand=True)

# Label the token columns by position. A fixed list of exactly four labels
# raises a length-mismatch error whenever the longest name has more or fewer
# than four tokens, so any extra tokens get generated labels instead.
ordinal_labels = ['first', 'second', 'third', 'fourth']
name.columns = [
    ordinal_labels[i] if i < len(ordinal_labels) else f'token_{i + 1}'
    for i in range(name.shape[1])
]

# assign() appends new columns at the end (same layout as a concat on the
# first run) and overwrites them in place on a re-run, so executing this cell
# twice does not create duplicate columns.
df_connections = df_connections.assign(**{col: name[col] for col in name.columns})
df_connections[['First Name', 'Connected On', *name.columns]].head()

Unnamed: 0,First Name,Connected On,first,second,third,fourth
0,maria pepe,2021-12-14,maria,pepe,,
1,Cecilia,2021-12-14,Cecilia,,,
2,Gabriela,2021-12-14,Gabriela,,,
3,Bella,2021-12-14,Bella,,,
4,Carlos Andres,2021-12-14,Carlos,Andres,,


In [None]:
# Build the detector used by the gender cells below.
# NOTE(review): 'us' presumably selects the country name dataset — confirm it
# is the right choice for the names in this export.
detector = gd.GenderDetector('us')

In [None]:
# Quick manual check of the detector on a single name.
detector.guess('Sofia')

'female'

In [None]:
def gender_detector(x):
    """Return the detector's gender guess for ``x``, or 'no detectó' on failure.

    Args:
        x: The name to classify; passed unchanged to ``detector.guess``.

    Returns:
        Whatever ``detector.guess(x)`` returns, or the string 'no detectó'
        when the detector raises an ordinary exception for this input.
    """
    try:
        return detector.guess(x)
    except Exception:
        # Deliberate best-effort: one unclassifiable name must not abort the
        # whole column. Catching Exception (instead of a bare except) lets
        # KeyboardInterrupt and SystemExit propagate, so the cell can still be
        # interrupted.
        return 'no detectó'

In [None]:
# Classify every connection by the first token of its name.
df_connections['gender'] = df_connections['first'].apply(gender_detector)

In [None]:
# How many connections fall into each detected gender label.
df_connections['gender'].value_counts()

male       1331
female      584
unknown       3
Name: gender, dtype: int64

Detectamos cuáles fueron los nombres cuyo género no se identificó ('unknown') para tratarlos: Desconocidos


In [None]:
# Rows the detector labelled 'unknown'.
df2 = df_connections[df_connections['gender'] == 'unknown']

In [None]:
# Show the original name, connection date and split name tokens of the
# 'unknown' rows (columns are picked by position).
df2.iloc[:,[0,5,6,7,8,9]].head()

Unnamed: 0,First Name,Connected On,first,second,third,fourth
8,Leo,2021-12-11,Leo,,,
10,Jaime,2021-12-11,Jaime,,,
11,Cristian,2021-09-14,Cristian,,,
29,Fabián Hernán,2021-12-13,Fabián,Hernán,,
30,Hernán Ceferino,2021-12-13,Hernán,Ceferino,,


Por otro lado, identificamos los nombres que directamente se descartaron (6 'no detectó').

In [None]:
# Rows where the detector raised and the fallback label was stored.
df_null = df_connections[df_connections['gender'] == 'no detectó']
df_null.iloc[:, [0, 5]].head()

Unnamed: 0,First Name,Connected On
330,İsmail,2021-10-20
580,İbrahim,2021-09-11
1437,Érika Agustina,2021-06-17
1567,Óscar,2021-06-07
1684,👨🏽‍💻Solomon,2021-05-04


Removemos los caracteres especiales de la columna `first`.

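Qué hace `unicodedata`: `unicodedata.normalize('NFKD', texto)` descompone cada carácter acentuado en su letra base más el signo diacrítico; al codificar luego a ASCII ignorando los errores, se descartan los diacríticos y cualquier otro carácter no ASCII (por ejemplo, los emojis).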

In [None]:
import unicodedata

In [None]:
def remove_accents(input_str):
    """Strip diacritics and any other non-ASCII characters from a string.

    The text is NFKD-normalised, which splits an accented character into its
    base letter plus combining marks; encoding to ASCII with errors ignored
    then drops the marks and every remaining non-ASCII character (emoji
    included).

    Args:
        input_str: The text to clean. Non-string values (e.g. ``None`` or NaN
            for a missing name token) are returned unchanged instead of
            raising ``TypeError``.

    Returns:
        The ASCII-only string, or ``input_str`` itself when it is not a string.
    """
    if not isinstance(input_str, str):
        return input_str
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    # The bytes are pure ASCII at this point, so decode with the same codec.
    return nfkd_form.encode('ascii', 'ignore').decode('ascii')

In [None]:
# Strip accents and other non-ASCII characters from the first name token.
df_connections['first'] = df_connections['first'].apply(remove_accents)
# Re-run the detection on the cleaned names. The 'gender' column was computed
# before this cleaning step, so without this line the cleaning has no effect
# on the gender analysis when the notebook is run top to bottom.
df_connections['gender'] = df_connections['first'].apply(gender_detector)

Quiero ver los géneros: gráfico y posiciones.

In [None]:
# Connections per detected gender as a (Gender, cantidad) frame, largest first.
df_connections_gender = (
    df_connections['gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='cantidad')
    .sort_values(by='cantidad', ascending=False)
)
df_connections_gender.head()

Unnamed: 0,Gender,cantidad
0,male,1331
1,female,584
2,unknown,3


In [None]:
# Count connections for every (Position, gender) pair, largest first.
df_gender_position = (
    df_connections
    .groupby(['Position', 'gender'])['First Name']
    .count()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
)

In [None]:
# The five most common (Position, gender) combinations.
df_gender_position.head()

Unnamed: 0,Position,gender,Count
459,Data Scientist,male,102
362,Data Analyst,male,66
458,Data Scientist,female,44
401,Data Engineer,male,39
361,Data Analyst,female,26


In [None]:
# Keep only the (Position, gender) pairs with more than 20 connections.
df_gender_position = df_gender_position.loc[lambda frame: frame['Count'] > 20]

In [None]:
# Vertical bars of connections per position, coloured by detected gender.
# The labels mapping must be keyed by the plotted column name ('Count'); the
# previous key 'cantidad' matched no column here, so the axis label was never
# applied.
fig = px.bar(df_gender_position, x='Position', y='Count',
              labels={'Count':'conexiones por roles y géneros'}, 
              height=400,
              orientation='v',
              color='gender')

fig.show()

Empezamos a analizar las empresas, por ejemplo Mercado Libre.

In [None]:
# Positions held by the connections who work at Mercado Libre.
df_connections.loc[df_connections['Company'] == 'Mercado Libre', 'Position'].value_counts()

Data Scientist                                                            7
Software Development Analyst                                              2
Ssr Software Engineer                                                     2
Software Developer                                                        2
Software Developer Analyst                                                2
Data Engineer                                                             2
Software Developer Junior                                                 1
Research Manager                                                          1
iOS Developer                                                             1
Senior Data Science and Machine Learning Engineer                         1
Sales Analyst                                                             1
Ssr. Data Scientist - Applied Machine Learning Team                       1
Supervisor BI - Pricing                                                   1
Machine Lear

In [None]:
# Detected gender split of the connections who work at Mercado Libre.
df_connections.loc[df_connections['Company'] == 'Mercado Libre', 'gender'].value_counts()

male      35
female    23
Name: gender, dtype: int64