<a href="https://colab.research.google.com/github/meyywwg/tds_project1/blob/main/analysis_project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#code for scraping
import requests
import csv
import time

# GitHub API token
GITHUB_TOKEN = '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'
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

# Helper function to clean up company names
def clean_company_name(company):
    if company:
        company = company.strip().lstrip('@').upper()
    return company

# Function to fetch users from the GitHub API
def fetch_users(city="Mumbai", min_followers=50):
    users = []
    page = 1

    while True:
        url = f"https://api.github.com/search/users?q=location:{city}+followers:>{min_followers}&page={page}&per_page=100"
        response = requests.get(url, headers=HEADERS)
        data = response.json()

        # Break if no more results
        if 'items' not in data or not data['items']:
            break

        for user in data['items']:
            # Get full user info
            user_url = user['url']
            user_response = requests.get(user_url, headers=HEADERS)
            user_data = user_response.json()

            # Extract required fields
            users.append({
                'login': user_data['login'],
                'name': user_data['name'],
                'company': clean_company_name(user_data['company']),
                'location': user_data['location'],
                'email': user_data['email'],
                'hireable': user_data['hireable'],
                'bio': user_data['bio'],
                'public_repos': user_data['public_repos'],
                'followers': user_data['followers'],
                'following': user_data['following'],
                'created_at': user_data['created_at'],
            })
        page += 1
        time.sleep(1)  # Avoid hitting API rate limits

    return users

# Function to fetch repositories for a user
def fetch_repositories(user_login):
    repositories = []
    page = 1

    while True:
        url = f"https://api.github.com/users/{user_login}/repos?per_page=100&page={page}"
        response = requests.get(url, headers=HEADERS)
        repo_data = response.json()

        # Break if no more repositories
        if not repo_data:
            break

        for repo in repo_data:
            repositories.append({
                'login': user_login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': repo['license']['key'] if repo['license'] else None,
            })

        # If fewer than 100 repositories are returned, it means we're on the last page
        if len(repo_data) < 100:
            break

        page += 1  # Move to the next page
        time.sleep(1)  # Avoid hitting API rate limits

    return repositories

# Save users to CSV
def save_users_to_csv(users, filename="users.csv"):
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=users[0].keys())
        writer.writeheader()
        writer.writerows(users)

# Save repositories to CSV
def save_repositories_to_csv(repositories, filename="repositories.csv"):
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=repositories[0].keys())
        writer.writeheader()
        writer.writerows(repositories)

def main():
    print("Fetching users...")
    users = fetch_users()
    save_users_to_csv(users)
    print(f"Saved {len(users)} users to users.csv")

    print("Fetching repositories...")
    all_repositories = []
    for user in users:
        user_repos = fetch_repositories(user["login"])
        all_repositories.extend(user_repos)
        print(f"Fetched {len(user_repos)} repositories for user {user['login']}")

    save_repositories_to_csv(all_repositories)
    print(f"Saved {len(all_repositories)} repositories to repositories.csv")

if __name__ == "__main__":
    main()

Fetching users...
Saved 724 users to users.csv
Fetching repositories...
Fetched 66 repositories for user ValentineFernandes
Fetched 37 repositories for user kovidgoyal
Fetched 113 repositories for user slidenerd
Fetched 88 repositories for user aryashah2k
Fetched 11 repositories for user coding-parrot
Fetched 9 repositories for user gkcs
Fetched 53 repositories for user darshilparmar
Fetched 330 repositories for user Kushal334
Fetched 77 repositories for user ritz078
Fetched 144 repositories for user PrasoonPratham
Fetched 35 repositories for user rmehta
Fetched 69 repositories for user LakshyaDuhoonISU
Fetched 61 repositories for user dmalvia
Fetched 41 repositories for user PiyushKumarSingh-90
Fetched 89 repositories for user mfaisalkhatri
Fetched 34 repositories for user omsandippatil
Fetched 17 repositories for user anujvyas
Fetched 89 repositories for user jalajthanaki
Fetched 81 repositories for user alisolanki
Fetched 65 repositories for user ForceGT
Fetched 139 repositories for

In [1]:
#Q1
import pandas as pd

# Load the data
users_df = pd.read_csv('users.csv')

# Sort by followers and get top 5
top_users = users_df.sort_values(by='followers', ascending=False).head(5)

# Extract logins
top_logins = top_users['login'].tolist()
result = ', '.join(top_logins)

print(result)

ValentineFernandes, kovidgoyal, slidenerd, aryashah2k, coding-parrot


In [2]:
#Q2
import pandas as pd

# Load the data
users_df = pd.read_csv('users.csv')

# Convert created_at to datetime
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Sort by created_at and get the earliest 5 users
earliest_users = users_df.sort_values(by='created_at').head(5)

# Extract logins
earliest_logins = earliest_users['login'].tolist()
result = ', '.join(earliest_logins)

print(result)


ivank, sandeepshetty, svs, nitinhayaran, nischal


In [3]:
#Q3
import pandas as pd

# Load the data
repositories_df = pd.read_csv('repositories.csv')

# Filter out missing license names
repositories_df = repositories_df[repositories_df['license_name'].notna()]

# Count occurrences of each license
license_counts = repositories_df['license_name'].value_counts()

# Get the top 3 licenses
top_licenses = license_counts.head(3).index.tolist()

# Join the license names in order
result = ', '.join(top_licenses)

print(result)

mit, apache-2.0, other


In [4]:
#Q4
import pandas as pd

# Load the data
users_df = pd.read_csv('users.csv')

# Count occurrences of each company
company_counts = users_df['company'].value_counts()

# Get the company with the highest count
most_common_company = company_counts.idxmax()
most_common_count = company_counts.max()

print(f"The majority of developers work at: {most_common_company} with {most_common_count} developers.")

The majority of developers work at: MASAI SCHOOL with 14 developers.


In [5]:
#Q5
import pandas as pd

# Load the data
repositories_df = pd.read_csv('repositories.csv')

# Count occurrences of each programming language, ignoring missing values
language_counts = repositories_df['language'].value_counts()

# Get the most popular programming language
most_popular_language = language_counts.idxmax()
most_popular_count = language_counts.max()

print(f"The most popular programming language is: {most_popular_language} with {most_popular_count} repositories.")

The most popular programming language is: JavaScript with 8305 repositories.


In [6]:
#Q6
import pandas as pd

# Load the data
users_df = pd.read_csv('users.csv')
repositories_df = pd.read_csv('repositories.csv')

# Convert created_at to datetime and filter users who joined after 2020
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
recent_users = users_df[users_df['created_at'] > '2020-01-01']

# Get the logins of recent users
recent_user_logins = recent_users['login'].tolist()

# Filter repositories by these users
recent_repositories = repositories_df[repositories_df['login'].isin(recent_user_logins)]

# Count occurrences of each programming language
language_counts = recent_repositories['language'].value_counts()

# Get the second most popular programming language
second_most_popular_language = language_counts.nlargest(2).index[1]
second_most_popular_count = language_counts.nlargest(2).values[1]

print(f"The second most popular programming language among users who joined after 2020 is: {second_most_popular_language} with {second_most_popular_count} repositories.")


The second most popular programming language among users who joined after 2020 is: HTML with 1304 repositories.


In [7]:
#Q7
import pandas as pd

# Load the data
repositories_df = pd.read_csv('repositories.csv')

# Group by programming language and calculate the average stars
average_stars = repositories_df.groupby('language')['stargazers_count'].mean()

# Identify the language with the highest average stars
highest_average_language = average_stars.idxmax()
highest_average_value = average_stars.max()

print(f"The programming language with the highest average number of stars per repository is: {highest_average_language} with an average of {highest_average_value:.2f} stars.")


The programming language with the highest average number of stars per repository is: TSQL with an average of 571.00 stars.


In [8]:
#Q8
import pandas as pd

# Load the data
users_df = pd.read_csv('users.csv')

# Calculate leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength and get the top 5
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Extract logins
top_logins = top_leaders['login'].tolist()
result = ', '.join(top_logins)

print(result)


kovidgoyal, coding-parrot, gkcs, slidenerd, dmalvia


In [9]:
#Q9
import pandas as pd

# Load the data
users_df = pd.read_csv('users.csv')

# Calculate the correlation between followers and public repositories
correlation = users_df['followers'].corr(users_df['public_repos'])

print(f"The correlation between the number of followers and the number of public repositories is: {correlation:.3f}")

The correlation between the number of followers and the number of public repositories is: 0.034


In [11]:
import pandas as pd
import statsmodels.api as sm

# Load the data
users_df = pd.read_csv('users.csv')

# Check for NaNs and infinite values in the relevant columns
print("Checking for NaNs or infinite values in public_repos and followers...")
print(users_df[['public_repos', 'followers']].isnull().sum())  # Count of NaNs
print((~users_df[['public_repos', 'followers']].isin([float('inf'), float('-inf')])).all())  # Check for infs

# Option 1: Drop rows with NaN or infinite values
users_df = users_df.dropna(subset=['public_repos', 'followers'])
users_df = users_df[(users_df['public_repos'] != float('inf')) & (users_df['public_repos'] != float('-inf'))]
users_df = users_df[(users_df['followers'] != float('inf')) & (users_df['followers'] != float('-inf'))]

# Define the independent variable (X) and dependent variable (Y)
X = users_df['public_repos']
Y = users_df['followers']

# Add a constant to the independent variable (for the intercept)
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(Y, X).fit()

# Get the summary of the regression results
summary = model.summary()

# Extract the coefficient for public_repos
additional_followers_per_repo = model.params['public_repos']

print(f"Regression Results:\n{summary}")
print(f"Estimated additional followers per additional public repository: {additional_followers_per_repo:.3f}")


Checking for NaNs or infinite values in public_repos and followers...
public_repos    1
followers       1
dtype: int64
public_repos    True
followers       True
dtype: bool
Regression Results:
                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.8467
Date:                Thu, 31 Oct 2024   Prob (F-statistic):              0.358
Time:                        14:10:44   Log-Likelihood:                -5238.0
No. Observations:                 724   AIC:                         1.048e+04
Df Residuals:                     722   BIC:                         1.049e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 

In [14]:
#Q11
import pandas as pd

# Load the data
repositories_df = pd.read_csv('repositories.csv')


# Calculate the correlation directly
correlation = repositories_df['has_projects'].corr(repositories_df['has_wiki'])

print(f"The correlation between having projects enabled and having a wiki enabled is: {correlation:.3f}")


The correlation between having projects enabled and having a wiki enabled is: 0.154


In [15]:
#Q12
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv('users.csv')

# Filter hireable and non-hireable users
hireable_users = users_df[users_df['hireable'] == True]
non_hireable_users = users_df[users_df['hireable'].isna() | (users_df['hireable'] == False)]

# Calculate average following for both groups
average_hireable_following = hireable_users['following'].mean()
average_non_hireable_following = non_hireable_users['following'].mean()

# Calculate the difference
difference = average_hireable_following - average_non_hireable_following

# Print the result rounded to three decimal places
print(f'Difference in average following (hireable - non-hireable): {difference:.3f}')

Difference in average following (hireable - non-hireable): 8.175


In [17]:
#Q13
import pandas as pd
import statsmodels.api as sm

# Load the users data from the CSV file
users_df = pd.read_csv('users.csv')

# Filter out users without bios
users_with_bios = users_df[users_df['bio'].notna()]

# Calculate the length of the bio in words
#users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split(" ").str.len()

users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


# Prepare the data for regression
X = users_with_bios['bio_word_count']  # Independent variable
y = users_with_bios['followers']        # Dependent variable

# Add a constant to the independent variable for the regression
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the regression slope (coefficient for bio_word_count)
slope = model.params['bio_word_count']

# Print the slope rounded to three decimal places
print(f'Regression slope of followers on bio word count: {slope:.3f}')


Regression slope of followers on bio word count: -0.378


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


In [18]:
#Q14
import pandas as pd

# Load the repositories data from the CSV file
repos_df = pd.read_csv('repositories.csv')

# Convert the created_at column to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for weekend days (Saturday: 5, Sunday: 6)
weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek.isin([5, 6])]

# Count the number of repositories created by each user
top_users = weekend_repos['login'].value_counts().head(5)

# Get the top 5 users' logins in order
top_users_logins = ', '.join(top_users.index)

# Print the result
print(f'Top 5 users who created the most repositories on weekends: {top_users_logins}')

Top 5 users who created the most repositories on weekends: mohd14shoeb, vinod1988, Kushal334, patilswapnilv, alokproc


In [22]:
#Q15
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv('users.csv')

# Total number of users
total_users = len(users_df)

# Filter hireable and non-hireable users
hireable_users = users_df[users_df['hireable'] == True]
non_hireable_users = users_df[users_df['hireable'].isna() | (users_df['hireable'] == False)]

# Calculate the fraction of users with email in both groups
fraction_hireable_with_email = hireable_users['email'].notna().mean()
fraction_non_hireable_with_email = non_hireable_users['email'].notna().mean()

# Calculate the difference
difference = fraction_hireable_with_email - fraction_non_hireable_with_email

# Print the result rounded to three decimal places
print(f'Difference in fraction of users with email: {difference:.3f}')

Difference in fraction of users with email: 0.223


In [20]:
#Q16
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv('users.csv')

# Filter out users without names
valid_users = users_df[users_df['name'].notna()]

# Extract surnames (last word in name)
valid_users['surname'] = valid_users['name'].str.strip().str.split().str[-1]

# Count occurrences of each surname
surname_counts = valid_users['surname'].value_counts()

# Find the most common surname(s)
max_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

# Sort surnames alphabetically
most_common_surnames.sort()

# Count users with the most common surname
number_of_users = max_count

# Print results
most_common_surnames_str = ', '.join(most_common_surnames)
print(f'Most common surname(s): {most_common_surnames_str}')
print(f'Number of users with the most common surname: {number_of_users}')

Most common surname(s): Singh
Number of users with the most common surname: 17


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_users['surname'] = valid_users['name'].str.strip().str.split().str[-1]
