In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression

In [None]:
df = pd.read_csv('users.csv')

In [None]:
df_repo = pd.read_csv('repositories.csv')

In [None]:
df.head()
df.shape

(337, 11)

In [None]:
df_repo.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,midudev,midudev/midu.dev,2018-11-20T21:29:52Z,451,451,HTML,False,False,GNU General Public License v3.0
1,midudev,midudev/landing-infojobs,2024-10-16T17:28:51Z,104,104,Astro,True,True,
2,midudev,midudev/midudev-issues,2024-10-16T09:58:41Z,11,11,,True,True,
3,midudev,midudev/cloudinary-hackathon-astro-example,2024-10-08T17:46:11Z,32,32,Astro,True,True,
4,midudev,midudev/javascript-100-proyectos,2024-02-14T12:00:57Z,2186,2186,HTML,True,True,Other


# Questions

Q1. Who are the top 5 users in Mumbai with the highest number of followers? List their login in order, comma-separated.

In [None]:
popular_logins = list(df.sort_values(by='followers', ascending=False).head()['login'])

In [None]:
for login in popular_logins:
    print(login, end=',')

midudev,ai,raysan5,vfarcic,spite,

Q2. Who are the 5 earliest registered GitHub users in Mumbai? List their login in ascending order of created_at, comma-separated.

In [None]:
early_logins = list(df.sort_values(by='created_at').head()['login'])

In [None]:
for login in early_logins:
    print(login, end=',')

oleganza,gravityblast,fesplugas,fxn,pauek,

Q3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [None]:
import csv
from collections import Counter

# Define the list to store license names
licenses = []

# Read the CSV file with UTF-8 encoding
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Check if the license_name field is present and not empty
        license_name = row.get('license_name', '').strip()
        if license_name:
            licenses.append(license_name)

# Count the occurrence of each license
license_counts = Counter(licenses)

# Get the 3 most common licenses
top_3_licenses = [license for license, count in license_counts.most_common(3)]

# Print the result as a comma-separated list
print(','.join(top_3_licenses))


null,MIT License,Apache License 2.0


Q4. Which company do the majority of these developers work at?

In [None]:
companies = []

# Read the CSV file with UTF-8 encoding
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Get and clean up the company field (ignore empty values)
        company = row.get('company', '').strip()
        if company:
            companies.append(company)

# Count the occurrence of each company
company_counts = Counter(companies)

# Find the most common company
most_common_company = company_counts.most_common(5)

# Print the result
if most_common_company:
    print(most_common_company[0][0])
else:
    print("No company data found.")


null


Q5. Which programming language is most popular among these users?

In [None]:
df_repo['language'].value_counts().head(n=1).index[0]

'JavaScript'

Q6. Which programming language is the second most popular among users who joined after 2020?

In [None]:
def compare_dates(date):
    if int(date.split('-')[0]) >= 2020:
      return True
    else:
      return False


In [None]:
df['after_2020'] = df['created_at'].apply(compare_dates)

In [None]:
after_2020 = list(df[df['after_2020'] == True]['login'])

In [None]:
def after_2020_language(login):
    if login in after_2020:
        return True
    else:
        return False

In [None]:
df_repo['after_2020'] = df_repo['login'].apply(after_2020_language)

In [None]:
df_repo[df_repo['after_2020']]['language'].value_counts().head(n=2).index[-1]

'Python'

Q7. Which language has the highest average number of stars per repository?

In [None]:
df_repo.groupby('language').mean('stargazers_count').sort_values(by='stargazers_count', ascending=False).index[0]

'Vim Script'

Q8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [None]:
df['leader_strength'] = df['followers'] / (1 + df['following'])

In [None]:
leaders = list(df.sort_values(by='leader_strength', ascending=False).head()['login'])

In [None]:
for leader in leaders:
  print(leader, end=',')

midudev,vfarcic,spite,amix,cfenollosa,

Q9. What is the correlation between the number of followers and the number of public repositories among users in Mumbai?

In [None]:
df[['followers', 'public_repos']].corr()

Unnamed: 0,followers,public_repos
followers,1.0,0.07124
public_repos,0.07124,1.0


Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [None]:
lin_reg = LinearRegression()

In [None]:
X = df['public_repos']
y = df['followers']

In [None]:
lin_reg.fit(X.values.reshape(-1, 1), y)

In [None]:
lin_reg.coef_

array([1.03109318])

Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [None]:
df_repo[['has_projects', 'has_wiki']].corr()

Unnamed: 0,has_projects,has_wiki
has_projects,1.0,0.32292
has_wiki,0.32292,1.0


Q12. Do hireable users follow more people than those who are not hireable?

Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [None]:
df.groupby('hireable')['following'].mean()

Unnamed: 0_level_0,following
hireable,Unnamed: 1_level_1
False,97.640553
True,392.308333


In [None]:
392.308333-97.640553

294.66778

In [None]:
386.106557-96.330317

289.77624000000003

Q13. Some developers write long bios. Does that help them get more followers?

What's the correlation of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)
Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

In [None]:
def get_length(s):
    a = s.split()
    return len(a)

def analyze_bio_followers_correlation(users_csv_path='users.csv'):
    # Read the data
    df = pd.read_csv(users_csv_path)

    # Filter out rows without bios
    df = df[df['bio'].notna() & (df['bio'] != '')]

    # Calculate bio length in Unicode characters
    df['bio_length'] = df['bio'].apply(get_length)

    # Prepare data for regression
    X = df['bio_length'].values.reshape(-1,1)
    y = df['followers'].values

    # Perform linear regression
    model = LinearRegression()
    model.fit(X, y)

    # Get the slope rounded to 3 decimal places
    slope = round(model.coef_[0], 3)

    # Print debug information
    print(f"Number of users with bios: {len(df)}")
    print(f"Bio length range: {df['bio_length'].min()} to {df['bio_length'].max()}")
    print(f"Followers range: {df['followers'].min()} to {df['followers'].max()}")
    print(f"R-squared: {model.score(X, y):.3f}")

    return slope

# Calculate the regression slope
result = analyze_bio_followers_correlation()
print(f"\nRegression slope: {result:.3f}")

Number of users with bios: 245
Bio length range: 1 to 31
Followers range: 101 to 28286
R-squared: 0.002

Regression slope: 13.721


Q14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [None]:
def is_weekend(date):
    year = date.split('-')[0]
    month = date.split('-')[1]
    day = date.split('-')[2][:2]

    date = datetime(int(year), int(month), int(day))
    if date.weekday() >= 5:
        return True
    else:
        return False

In [None]:
df_repo['weekend'] = df_repo['created_at'].apply(is_weekend)

In [None]:
weekend_logins = df_repo[df_repo['weekend'] == True]['login'].value_counts().head().index

In [None]:
for login in weekend_logins:
    print(login, end=',')

kinow,nilportugues,ajsb85,vfarcic,wlsf82,

Q15. Do people who are hireable share their email addresses more often?


[fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
def analyze_email_sharing(users_csv_path='users.csv'):
    # Read the complete CSV file
    df = pd.read_csv(users_csv_path)

    # Convert email column to boolean (True if email exists, False if NaN or empty)
    df['has_email'] = df['email'].notna() & (df['email'] != '')

    # Calculate for hireable users
    hireable_mask = df['hireable'] == True
    if hireable_mask.any():
        hireable_email_fraction = df[hireable_mask]['has_email'].mean()
    else:
        hireable_email_fraction = 0

    # Calculate for non-hireable users
    non_hireable_mask = df['hireable'] != True
    if non_hireable_mask.any():
        non_hireable_email_fraction = df[non_hireable_mask]['has_email'].mean()
    else:
        non_hireable_email_fraction = 0

    # Calculate difference and round to 3 decimal places
    difference = round(hireable_email_fraction - non_hireable_email_fraction, 3)

    # Print debug information
    print(f"Total users: {len(df)}")
    print(f"Hireable users with email: {df[hireable_mask]['has_email'].sum()}/{hireable_mask.sum()}")
    print(f"Non-hireable users with email: {df[non_hireable_mask]['has_email'].sum()}/{non_hireable_mask.sum()}")
    print(f"Hireable fraction: {hireable_email_fraction:.3f}")
    print(f"Non-hireable fraction: {non_hireable_email_fraction:.3f}")

    return difference

# Read and analyze the complete dataset
result = analyze_email_sharing()
print(f"\nFinal result: {result:.3f}")

Total users: 337
Hireable users with email: 67/120
Non-hireable users with email: 100/217
Hireable fraction: 0.558
Non-hireable fraction: 0.461

Final result: 0.098


Q16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [None]:
def get_surname(name):
    if name is not None:
        return str(name).strip().split()[-1]
    else:
        return None

In [None]:
df['surname'] = df['name'].apply(get_surname)

In [None]:
df['surname'].value_counts().head()

Unnamed: 0_level_0,count
surname,Unnamed: 1_level_1
,7
Ortiz,3
Martínez,3
Perez,2
López,2
