In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('users.csv')

In [3]:
df_repo = pd.read_csv('repositories.csv')

In [4]:
df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,midudev,Miguel Ángel Durán,,Barcelona,miduga@gmail.com,False,Te enseño Programación y Desarrollo Web. Cread...,194,28283,2,2012-03-21T18:31:36Z
1,ai,Andrey Sitnik,EVILMARTIANS,"Barcelona, Spain",andrey@sitnik.ru,False,"The creator of Autoprefixer, @postcss, @browse...",85,9156,139,2008-08-02T16:34:20Z
2,raysan5,Ray,RAYLIBTECH,Barcelona,raysan5@gmail.com,True,I make tools and technology for videogames dev...,26,3571,272,2013-10-24T15:41:54Z
3,vfarcic,Viktor Farcic,UPBOUND,"Barcelona, Spain",viktor@farcic.com,False,Developer Advocate @Upbound,451,2974,0,2013-10-18T07:28:17Z
4,spite,Jaume Sanchez,GOOGLE-DEEPMIND,London · Barcelona,hello@clicktorelease.com,False,"Web Dev, pixel burner - WebGL · WebXR",135,2547,0,2011-04-15T16:00:06Z


In [5]:
df_repo.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,midudev,midudev/landing-infojobs,2024-10-16T17:28:51Z,103,103,Astro,True,True,
1,midudev,midudev/midu.dev,2018-11-20T21:29:52Z,451,451,HTML,False,False,GNU General Public License v3.0
2,midudev,midudev/midudev-issues,2024-10-16T09:58:41Z,11,11,,True,True,
3,midudev,midudev/cloudinary-hackathon-astro-example,2024-10-08T17:46:11Z,32,32,Astro,True,True,
4,midudev,midudev/javascript-100-proyectos,2024-02-14T12:00:57Z,2184,2184,HTML,True,True,Other


# Questions

Q1. Who are the top 5 users in Mumbai with the highest number of followers? List their login in order, comma-separated.

In [6]:
popular_logins = list(df.sort_values(by='followers', ascending=False).head()['login'])

In [7]:
for login in popular_logins:
    print(login, end=',')

midudev,ai,raysan5,vfarcic,spite,

Q2. Who are the 5 earliest registered GitHub users in Mumbai? List their login in ascending order of created_at, comma-separated.

In [8]:
early_logins = list(df.sort_values(by='created_at').head()['login'])

In [9]:
for login in early_logins:
    print(login, end=',')

oleganza,gravityblast,fesplugas,fxn,pauek,

Q3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [10]:
pop_liscense = list(df_repo.value_counts('license_name').head(n=3).index)

In [11]:
for license in pop_liscense:
    print(license, end=',')

MIT License,Apache License 2.0,Other,

Q4. Which company do the majority of these developers work at?

In [12]:
df['company'].value_counts().head(n=1).index[0]

'FREELANCE'

Q5. Which programming language is most popular among these users?

In [13]:
df_repo['language'].value_counts().head(n=1).index[0]

'JavaScript'

Q6. Which programming language is the second most popular among users who joined after 2020?

In [14]:
def compare_dates(date):
    if int(date.split('-')[0]) > 2020:
      return True
    else:
      return False


In [15]:
df['after_2020'] = df['created_at'].apply(compare_dates)

In [16]:
after_2020 = list(df[df['after_2020'] == True]['login'])

In [17]:
def after_2020_language(login):
    if login in after_2020:
        return True
    else:
        return False

In [18]:
df_repo['after_2020'] = df_repo['login'].apply(after_2020_language)

In [19]:
df_repo[df_repo['after_2020']]['language'].value_counts().head(n=2).index[-1]

'JavaScript'

Q7. Which language has the highest average number of stars per repository?

In [20]:
df_repo.groupby('language').mean('stargazers_count').sort_values(by='stargazers_count', ascending=False).index[0]

'Vim Script'

Q8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [21]:
df['leader_strength'] = df['followers'] / (1 + df['following'])

In [22]:
leaders = list(df.sort_values(by='leader_strength', ascending=False).head()['login'])

In [23]:
for leader in leaders:
  print(leader, end=',')

midudev,vfarcic,spite,amix,cfenollosa,

Q9. What is the correlation between the number of followers and the number of public repositories among users in Mumbai?

In [24]:
df[['followers', 'public_repos']].corr()

Unnamed: 0,followers,public_repos
followers,1.0,0.072118
public_repos,0.072118,1.0


Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [25]:
lin_reg = LinearRegression()

In [26]:
X = df['public_repos']
y = df['followers']

In [27]:
lin_reg.fit(X.values.reshape(-1, 1), y)

In [28]:
lin_reg.coef_

array([1.04243512])

Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [29]:
df_repo[['has_projects', 'has_wiki']].corr()

Unnamed: 0,has_projects,has_wiki
has_projects,1.0,0.322438
has_wiki,0.322438,1.0


Q12. Do hireable users follow more people than those who are not hireable?

Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [30]:
df.groupby('hireable')['following'].mean()

Unnamed: 0_level_0,following
hireable,Unnamed: 1_level_1
False,96.330317
True,386.106557


In [31]:
386.106557-96.330317

289.77624000000003

Q13. Some developers write long bios. Does that help them get more followers?

What's the correlation of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)
Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

In [40]:
def get_length(s):
    a = s.split()
    return len(a)

def analyze_bio_followers_correlation(users_csv_path='users.csv'):
    # Read the data
    df = pd.read_csv(users_csv_path)

    # Filter out rows without bios
    df = df[df['bio'].notna() & (df['bio'] != '')]

    # Calculate bio length in Unicode characters
    df['bio_length'] = df['bio'].apply(get_length)

    # Prepare data for regression
    X = df['bio_length'].values.reshape(-1,1)
    y = df['followers'].values

    # Perform linear regression
    model = LinearRegression()
    model.fit(X, y)

    # Get the slope rounded to 3 decimal places
    slope = round(model.coef_[0], 3)

    # Print debug information
    print(f"Number of users with bios: {len(df)}")
    print(f"Bio length range: {df['bio_length'].min()} to {df['bio_length'].max()}")
    print(f"Followers range: {df['followers'].min()} to {df['followers'].max()}")
    print(f"R-squared: {model.score(X, y):.3f}")

    return slope

# Calculate the regression slope
result = analyze_bio_followers_correlation()
print(f"\nRegression slope: {result:.3f}")

Number of users with bios: 250
Bio length range: 1 to 31
Followers range: 100 to 28283
R-squared: 0.002

Regression slope: 13.477


Q14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [41]:
def is_weekend(date):
    year = date.split('-')[0]
    month = date.split('-')[1]
    day = date.split('-')[2][:2]

    date = datetime(int(year), int(month), int(day))
    if date.weekday() >= 5:
        return True
    else:
        return False

In [42]:
df_repo['weekend'] = df_repo['created_at'].apply(is_weekend)

In [43]:
weekend_logins = df_repo[df_repo['weekend'] == True]['login'].value_counts().head().index

In [44]:
for login in weekend_logins:
    print(login, end=',')

kinow,nilportugues,ajsb85,vfarcic,wlsf82,

Q15. Do people who are hireable share their email addresses more often?


[fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

In [55]:
def analyze_email_sharing(users_csv_path='users.csv'):
    # Read the complete CSV file
    df = pd.read_csv(users_csv_path)

    # Convert email column to boolean (True if email exists, False if NaN or empty)
    df['has_email'] = df['email'].notna() & (df['email'] != '')

    # Calculate for hireable users
    hireable_mask = df['hireable'] == True
    if hireable_mask.any():
        hireable_email_fraction = df[hireable_mask]['has_email'].mean()
    else:
        hireable_email_fraction = 0

    # Calculate for non-hireable users
    non_hireable_mask = df['hireable'] != True
    if non_hireable_mask.any():
        non_hireable_email_fraction = df[non_hireable_mask]['has_email'].mean()
    else:
        non_hireable_email_fraction = 0

    # Calculate difference and round to 3 decimal places
    difference = round(hireable_email_fraction - non_hireable_email_fraction, 3)

    # Print debug information
    print(f"Total users: {len(df)}")
    print(f"Hireable users with email: {df[hireable_mask]['has_email'].sum()}/{hireable_mask.sum()}")
    print(f"Non-hireable users with email: {df[non_hireable_mask]['has_email'].sum()}/{non_hireable_mask.sum()}")
    print(f"Hireable fraction: {hireable_email_fraction:.3f}")
    print(f"Non-hireable fraction: {non_hireable_email_fraction:.3f}")

    return difference

# Read and analyze the complete dataset
result = analyze_email_sharing()
print(f"\nFinal result: {result:.3f}")

Total users: 343
Hireable users with email: 69/122
Non-hireable users with email: 103/221
Hireable fraction: 0.566
Non-hireable fraction: 0.466

Final result: 0.100


Q16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [51]:
def get_surname(name):
    if name is not None:
        return str(name).strip().split()[-1]
    else:
        return None

In [53]:
df['surname'] = df['name'].apply(get_surname)

In [54]:
df['surname'].value_counts().head()

Unnamed: 0_level_0,count
surname,Unnamed: 1_level_1
,7
Ortiz,3
Martínez,3
López,3
Sanchez,2
