# Libs

In [20]:
import requests as req
import pandas as pd
import json
from datetime import datetime
import os
from keys import TOKEN_GITHUB

In [21]:
# Echo the value imported from `keys` to confirm the import worked.
# NOTE(review): this writes the token into the cell output; avoid doing so with a real secret.
TOKEN_GITHUB

'TESTE'

# Utils 

## Consts

In [22]:
# GitHub GraphQL endpoint that requests are posted to.
DOMAIN = 'https://api.github.com/graphql'
# NOTE(review): empty here, so HEADERS carries an empty bearer token; presumably
# TOKEN_GITHUB (imported from keys) is meant to be used instead -- confirm.
TOKEN = ''
# HTTP headers sent with every API request; the token is interpolated once, at definition time.
HEADERS = {
  'Authorization': f'bearer {TOKEN}',
  'Content-Type': 'application/json'
}
# Naive local timestamp captured once, when this cell runs.
TODAY = datetime.now()
# Number of repositories the collection loop aims to gather.
NUMBER_OF_REPOSITORIES=1000
# NOTE(review): the name is misspelled ("LANGUANGE") and is not referenced in the visible code.
PRIMARY_LANGUANGE = 'java'

# Portion of the current working directory that precedes '\lab-experimentacao-02',
# with backslashes converted to forward slashes.
# NOTE(review): the split uses a backslash separator, so on a path without backslashes
# it is a no-op and ROOT_PATH is simply the current working directory.
ROOT_PATH = os.getcwd().split('\\lab-experimentacao-02')[0].replace('\\', '/')
# NOTE(review): ends with '/'; joining it to a repository name with another '/' yields a double slash.
BASE_URL_GITHUB = 'https://github.com/'
# URL of the CK tool's repository (not referenced by the visible code).
CK_REPO = 'https://github.com/mauricioaniche/ck'

## Functions

In [12]:
def doPost(data: dict, timeout: float = 30) -> dict:
    """POST a JSON payload to the GraphQL endpoint and return the decoded reply.

    Args:
        data: JSON-serializable request body (sent through ``json=``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON body of a ``200`` response. The caller still has to
        inspect it for an ``errors`` key.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = req.post(DOMAIN, headers=HEADERS, json=data, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'Erro ao fazer requisição: {response.status_code} \n {response.text}')
    return response.json()

def analisar_createdAt(repositories: list, reference_date: datetime = None) -> list:
    """Compute the age, in years, of each repository.

    Args:
        repositories: Sequence of mappings, each holding a ``createdAt``
            timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
        reference_date: Moment the ages are measured against. Defaults to the
            module-level ``TODAY`` so existing callers behave as before.

    Returns:
        One float per repository: whole elapsed days divided by 365, rounded
        to two decimals. An empty input yields an empty list.

    Raises:
        KeyError: If a repository has no ``createdAt`` entry.
        ValueError: If a timestamp does not match the expected format.
    """
    if reference_date is None:
        reference_date = TODAY

    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    # Read the key straight from each mapping; no DataFrame is needed for this.
    return [
        round((reference_date - datetime.strptime(repo['createdAt'], timestamp_format)).days / 365, 2)
        for repo in repositories
    ]

def clone_repo(dir_path: str, repo_name: str) -> bool:
    """Clone a GitHub repository into ``dir_path``.

    The process working directory is switched to ``dir_path`` and left there,
    exactly as before, so later relative-path operations resolve inside it.

    Args:
        dir_path: Directory in which ``git clone`` is executed.
        repo_name: Repository path appended to ``BASE_URL_GITHUB``
            (for example ``owner/name``).

    Returns:
        True when ``git clone`` exits with status 0, False otherwise
        (including when the ``git`` executable cannot be found).
    """
    import subprocess

    # Normalize the join so the base URL's trailing '/' does not produce '//'.
    repo_url = f"{BASE_URL_GITHUB.rstrip('/')}/{repo_name.lstrip('/')}"
    os.chdir(dir_path)
    try:
        # Argument list instead of a shell string: nothing in the URL is
        # interpreted by a shell.
        return subprocess.run(['git', 'clone', repo_url]).returncode == 0
    except FileNotFoundError:
        return False

    
def execute_ck(repo_name: str, use_jars: str = 'true', max_files_per_partition: int = 0, variables_and_fields_metrics: str = 'false') -> bool:
    """Run the CK jar against a project located under ``ROOT_PATH``.

    The jar is expected at ``ROOT_PATH/ck/target/`` and the project at
    ``ROOT_PATH/<repo_name>``; results go to ``ROOT_PATH/output/<repo_name>/``,
    which is created when missing.

    Args:
        repo_name: Directory name of the project, relative to ``ROOT_PATH``.
        use_jars: Second positional argument passed to the jar.
        max_files_per_partition: Third positional argument passed to the jar.
        variables_and_fields_metrics: Fourth positional argument passed to the jar.

    Returns:
        True when the ``java`` process exits with status 0, False otherwise
        (including when the ``java`` executable cannot be found).
    """
    import subprocess

    ck_path = f'{ROOT_PATH}/ck/target/ck-0.7.1-SNAPSHOT-jar-with-dependencies.jar'
    project_path = f'{ROOT_PATH}/{repo_name}'
    destiny_path = f'{ROOT_PATH}/output/{repo_name}/'

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(destiny_path, exist_ok=True)

    # Argument list instead of a shell string, so paths containing spaces
    # reach the JVM as single arguments.
    command_to_run_ck = [
        'java', '-jar', ck_path,
        project_path,
        str(use_jars),
        str(max_files_per_partition),
        str(variables_and_fields_metrics),
        destiny_path,
    ]
    try:
        return subprocess.run(command_to_run_ck).returncode == 0
    except FileNotFoundError:
        return False
  
    
def delete_repo(repo_name: str = None, *repo_url: str)->bool:
    """Recursively delete a cloned repository directory.

    Args:
        repo_name: Path of the directory to remove (relative paths resolve
            against the current working directory).
        *repo_url: Used only when ``repo_name`` is empty; the directory name is
            taken from the last path segment of the first URL, without a
            trailing ``.git``.

    Returns:
        True when the directory was removed, False when removal failed
        (for example because the directory does not exist).

    Raises:
        ValueError: If neither ``repo_name`` nor a URL is supplied.
    """
    import shutil
    import stat
    import sys

    if not repo_name:
        if not repo_url:
            raise ValueError("repo_name must be provided or calculated from repo_url")
        repo_name = repo_url[0].rstrip('/').split('/')[-1]
        # Strip only a trailing '.git'; a blanket replace would also mangle
        # names such as 'user.github.io'.
        if repo_name.endswith('.git'):
            repo_name = repo_name[:-len('.git')]

    def _make_writable_and_retry(func, path, _exc_info):
        # Clear the read-only attribute on the failing entry and retry,
        # so read-only files do not abort the removal.
        os.chmod(path, stat.S_IWRITE)
        func(path)

    # 'onexc' replaced the deprecated 'onerror' hook in Python 3.12.
    handler_kwarg = 'onexc' if sys.version_info >= (3, 12) else 'onerror'
    try:
        shutil.rmtree(repo_name, **{handler_kwarg: _make_writable_and_retry})
    except OSError:
        return False
    return True

def join_csv(path_files: list, output_path: str = 'output.csv'):
    """Concatenate several CSV files into a single CSV file.

    Files that do not exist are reported and skipped. Rows are stacked in the
    order of ``path_files`` with a fresh index.

    Args:
        path_files: Paths of the CSV files to merge.
        output_path: Where the merged CSV is written; defaults to
            ``output.csv`` in the current working directory.

    Raises:
        Exception: If ``path_files`` is empty.
        ValueError: If none of the listed files could be read.
    """
    if not path_files:
        raise Exception("Lista de arquivos vazia.")

    dataframes = []
    for path in path_files:
        try:
            dataframes.append(pd.read_csv(path))
        except FileNotFoundError:
            print(f"Arquivo {path} não encontrado.")

    # pd.concat rejects an empty list with an opaque message; fail clearly instead.
    if not dataframes:
        raise ValueError("Nenhum arquivo CSV pôde ser lido.")

    concatenated_df = pd.concat(dataframes, ignore_index=True)

    try:
        concatenated_df.to_csv(output_path, index=False)
    except Exception as e:
        # Deliberately best-effort: a failed write is reported, not raised.
        print(f"Erro ao escrever arquivo CSV: {e}")
    else:
        print(f"Arquivo CSV gerado com sucesso: {output_path}")


## Query

In [46]:
# GraphQL document for the repository search.
# Variables: $queryString (search expression), $perPage (page size) and
# $cursor (pagination cursor; may be null).
# For each Repository node it selects the name, owner login, star count,
# creation date and the first five entries of the default branch's commit
# history, plus pageInfo (endCursor / hasNextPage) for pagination.
QUERY_TEMPLATE = '''
query search($queryString: String!, $perPage: Int!, $cursor: String) {
  search(query: $queryString, type: REPOSITORY, first: $perPage, after: $cursor) {
    edges {
      node {
        ... on Repository {
          name
          nameWithOwner
          owner {
            login
          }
          stargazers {
            totalCount
          }
          createdAt
          defaultBranchRef {
            name
            target {
              ... on Commit {
                history(first: 5) {
                  edges {
                    node {
                      message
                      committedDate
                      author {
                        name
                        email
                        date
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    pageInfo {
      endCursor
      hasNextPage
    }
  }
}
'''

## Requests

In [47]:
# Page through the search results until NUMBER_OF_REPOSITORIES nodes are
# collected, the API reports an error, or there are no more pages.
per_page = 100
cursor = None
query_string = "language:java"
repositories = []

while len(repositories) < NUMBER_OF_REPOSITORIES:
    # Ask only for what is still missing, so the total never overshoots the
    # target when it is not a multiple of per_page.
    remaining = NUMBER_OF_REPOSITORIES - len(repositories)
    variables = {
        "queryString": query_string,
        "perPage": min(per_page, remaining),
        "cursor": cursor
    }

    data = doPost(data={'query': QUERY_TEMPLATE, 'variables': variables})

    # GraphQL failures arrive in the response body under an 'errors' key.
    if 'errors' in data:
        print("GraphQL query failed:", data['errors'])
        break

    search = data['data']['search']
    repositories.extend(edge['node'] for edge in search['edges'])

    page_info = search['pageInfo']
    if not page_info['hasNextPage']:
        break
    cursor = page_info['endCursor']

print("Total repositories: ", len(repositories))
print("Cursor: ", cursor)
print("Per page: ", per_page)

# Optional: persist the raw results.
# data_brutus = pd.DataFrame(repositories)
# data_brutus.to_csv('dados_base.csv', index=False, sep=';')

Total repositories:  1000
Cursor:  Y3Vyc29yOjkwMA==
Per page:  100


## Data Processing

In [None]:
# Display the second collected node to inspect the structure of a raw result.
repositories[1]

In [48]:
# Summary table with one row per collected repository: full name, star count
# (0 when the entry is not a dict or has no count) and age in years.
dataFrame_tratado = pd.DataFrame({
    'Repositorio': [item.get('nameWithOwner') for item in repositories],
    'Estrelas': [
        item.get('stargazers', {}).get('totalCount', 0) if isinstance(item, dict) else 0
        for item in repositories
    ],
    'Anos': analisar_createdAt(repositories),
})

dataFrame_tratado.head()

Unnamed: 0,Repositorio,Estrelas,Anos
0,Snailclimb/JavaGuide,141809,5.87
1,iluwatar/java-design-patterns,86082,9.61
2,MisterBooo/LeetCodeAnimation,74839,5.28
3,macrozheng/mall,74721,5.96
4,doocs/advanced-java,73809,5.45


## Script to Clone

In [None]:
# End-to-end trial on a single repository: clone it, run CK on it, then
# remove the clone.
destiny_path = ROOT_PATH
repo_teste = 'pabloaugustocm17/grafo'
# The clone lands in a directory named after the last segment of the path.
repo_dir = repo_teste.split('/')[-1]

if clone_repo(dir_path=destiny_path, repo_name=repo_teste):
    if not execute_ck(repo_dir):
        print(f'Falha ao executar o CK em {repo_dir}')
    # TODO: merge the CK CSV outputs with join_csv(...) before removing the clone.
    # The clone is removed even when CK fails, so no checkout is left behind.
    delete_repo(repo_name=repo_dir)
else:
    # Without a clone there is nothing to analyse or delete.
    print(f'Falha ao clonar {repo_teste}')