# Extracting and ranking keywords from PyPI project descriptions

## Creation of a raw content dataset

As a first example, let's create a dataset of descriptions from ~70000 PyPI projects previously aggregated using Selinon and link each project description to its trove classifiers and keywords from PyPI.
The dataset should contain the following information:

``Project name (str) | Project description (str) | Project PyPI classifiers (list[str]) or Project PyPI keywords (list[str])``

In [39]:
import csv
import json
import math
import nltk
from nltk.corpus import stopwords
import os
from pprint import PrettyPrinter
import re
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
from typing import Dict
from typing import List

In [40]:
_DATA_PATH = "data"
_DATASETS_SAVING_PATH = "datasets/"

### Assembling and saving the datasets:

In [41]:
def get_projects_with_descriptors_dataset(descriptors: str) -> Dict[str, list]:
    """Retrieve projects with PyPI classifiers or keywords from data aggregated with Selinon."""
    projects_with_descriptors_dataset = {}
    for root, dirs, files in os.walk(os.path.join(".", _DATA_PATH)):
        for file in files:
            with open(os.path.join(".", _DATA_PATH, file), "r") as json_file:
                file_content = json.loads(json_file.read())
                if descriptors == "classifiers":
                    projects_with_descriptors_dataset[file] = [file_content["info"]["description"], file_content["info"]["classifiers"]]
                elif descriptors == "keywords":
                    projects_with_descriptors_dataset[file] = [file_content["info"]["description"], file_content["info"]["keywords"]]
                else:
                    raise ValueError("Invalid descriptors name specified")

    return projects_with_descriptors_dataset

In [42]:
# Create a subsample of the datasets with classifiers and keywords

import random

NUMBER_OF_PROJECTS = 1000

projects_with_classifiers_dataset = get_projects_with_descriptors_dataset("classifiers")

random_keys = projects_with_classifiers_dataset.keys()
random.shuffle(list(projects_with_classifiers_dataset.keys()))
random_keys = list(random_keys)[:NUMBER_OF_PROJECTS]

with open(os.path.join(_DATASETS_SAVING_PATH, "projects_with_classifiers_dataset.csv"), "w") as csv_file:
    writer = csv.writer(csv_file)
    for project_name in random_keys:
        writer.writerow([project_name, projects_with_classifiers_dataset[project_name][0], projects_with_classifiers_dataset[project_name][1]])
        

projects_with_keywords_dataset = get_projects_with_descriptors_dataset("keywords")

with open(os.path.join(_DATASETS_SAVING_PATH, "projects_with_keywords_dataset.csv"), "w") as csv_file:
    writer = csv.writer(csv_file)
    for project_name in random_keys:
        writer.writerow([project_name, projects_with_keywords_dataset[project_name][0], projects_with_keywords_dataset[project_name][1]])

We have now built two datasets containing each project's name and decsription, followed by corresponding classifiers or keywords. 

## Building a dataset with the most important tokens per package

This step consists in building a dataset that contains tokens of the description for each package ranked by importance based on the [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) tokens ranking method. 

TF-IDF (for Term Frequency - Inverse Document Frequency) is a statistical method to find the most important terms of a document from a document corpus, i.e. a set of documents. The technique used relies on an analysis of the frequency of terms in tokenized and pre-processed documents, where the most frequent terms across documents of the corpus are penalized to highlight the terms appearing more frequently in the studied document, and therefore to identify the theme of the analyzed text. The corpus of documents considered in this analysis is the set of descriptions from PyPI Python packages which are described with at least one classifier on the PyPI index. 

This dataset could be used to train a model that will associate the most important words of a project description to the project's classifiers, which should allow to classify projects that are not explicitly classified according to their description.

### Text pre-processing

The first step of the descriptions text preprocessing is to filter them in order to extract only relevant words (removing stopwords) and remove non alphanumerical characters.

In [43]:
def preprocess_dataset(dataset_name: str) -> Dict[str, list]:
    preprocessed_dataset = {}
    nltk.download('stopwords')
    stopwords_complete_list = stopwords.words('english') + [stopword.capitalize() for stopword in stopwords.words('english')]

    with open(os.path.join(_DATASETS_SAVING_PATH, dataset_name), "r") as csv_file:
        reader = csv.reader(csv_file)
        for row in reader:
            preprocessed_description = list(filter(str.isalnum, row[1].split(" ")))
            preprocessed_description = [word.lower() for word in preprocessed_description if word not in stopwords_complete_list]
            preprocessed_dataset[row[0]] = [preprocessed_description, row[2]]

    return preprocessed_dataset

In [44]:
# Increasing field size limit to avoid related errors:
csv.field_size_limit(sys.maxsize)

9223372036854775807

Let's build a dataset containing only the project names and descriptions

In [53]:
preprocessed_projects_with_classifiers_dataset = preprocess_dataset("projects_with_classifiers_dataset.csv")

# Print the dataset:
pp = PrettyPrinter(indent=2)
pp.pprint(preprocessed_projects_with_classifiers_dataset)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mcostant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{ '0': [ [],
         "['Development Status :: 1 - Planning', 'Intended Audience :: "
         "Developers', 'License :: OSI Approved :: MIT License', 'Operating "
         "System :: OS Independent', 'Programming Language :: Python :: 2', "
         "'Programming Language :: Python :: 2.6', 'Programming Language :: "
         "Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming "
         "Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', "
         "'Programming Language :: Python :: 3.6']"],
  '0-0': [['unknown'], '[]'],
  '0-0-1': [ ['lib', 'creating', 'python', 'create', 'cmd'],
             "['Development Status :: 3 - Alpha', 'Intended Audience :: "
             "Developers', 'License :: OSI Approved :: GNU General Public "
             "License v3 (GPLv3)', 'Programming Language :: Python :: 3.5', "
             "'Topic :: Software Development :: Build Tools']"],
  '0-core-client': [ [ 'python',
                       'install',
              

In [46]:
preprocessed_projects_with_keywords_dataset = preprocess_dataset("projects_with_keywords_dataset.csv")

# Print the dataset:
# pp = PrettyPrinter(indent=2)
# pp.pprint(preprocessed_projects_with_keywords_dataset)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mcostant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Then, we want to compute the TF-IDF score of each term in each document description:

In [52]:
def tf_idf(corpus: Dict[str, list]) -> Dict[str, Dict[str, float]]:
    """Generate a dictionary of tf-idf words scores per document in the corpus."""
    tf_idf_result = {}

    corpus_vocabulary = []
    for package, description in corpus.items():
        corpus_vocabulary += description

    for package in corpus.keys():
        description = corpus[package][0]
        for term in description:
            tf_document = description.count(term)
            tf_corpus = corpus_vocabulary.count(term)
            if tf_corpus != 0:
                idf = math.log10(len(corpus)/tf_corpus)

                if package not in tf_idf_result.keys():
                    tf_idf_result[package] = {}
                    tf_idf_result[package][term] = tf_document * idf

                else:
                    if term not in tf_idf_result[package].keys():
                        tf_idf_result[package][term] = tf_document * idf
            
    return tf_idf_result

Applying TF-IDF to the datasets:

In [48]:
tf_idf_result_classifiers = tf_idf(preprocessed_projects_with_classifiers_dataset)
tf_idf_result_keywords = tf_idf(preprocessed_projects_with_keywords_dataset)

['aa']
[]
['integration', 'target', 'make', 'airtable', 'import', 'easier', 'also', 'set', 'project', 'please', 'migrate', 'app', 'migrate', 'airtable', 'api', 'databases', 'database', 'settings', 'table', 'name', 'path', 'table', 'airtable', 'api', 'endpoint', 'folder', 'json', 'data', 'folder', 'uploaded', 'files', 'library', 'save', 'uploaded', 'files', 'import', 'class', 'model', 'django', 'python']
['unknown']
['unknown']
['51degrees', 'mobile', 'detector', 'trie', 'mobile', 'detector', 'python', 'wrapper', 'c', 'solution', 'check', 'detailed', 'extra', 'documentation', 'useful', '2013', 'see', 'see']
['python', 'install', 'import', 'core0', 'u', 'stuff', 'exposes', 'tools', 'container']
['lib', 'creating', 'python', 'create', 'cmd']
['deploy', 'packages']
['python', 'python', 'client', 'used', 'talk', '0', 'rest', 'install', 'import', 'c', 'update', 'client', 'raml', 'client', 'python']
['unknown']
['implemenation', 'sbst', 'standard', 'python', 'found', 'quite', 'inconvenient', 

In [49]:
pp = PrettyPrinter(indent=2)
pp.pprint(tf_idf_result_classifiers)
pp.pprint(tf_idf_result_keywords)

{}
{ '0-0-1': {'python': 2.6989700043360187},
  '0-core-client': {'python': 2.6989700043360187},
  '0-orchestrator': {'python': 8.096910013008056},
  '090807040506030201testpip': {'python': 2.6989700043360187},
  '0wned': {'python': 13.494850021680094, 'test': 3.0},
  '0x-order-utils': {'python': 2.6989700043360187},
  '0x-sra-client': {'ethereum': 6.0, 'python': 10.795880017344075},
  '0x-web3': { 'add': 3.0,
               'chat': 6.0,
               'python': 16.193820026016112,
               'test': 21.0},
  '115wangpan': {'python': 8.096910013008056, 'tasks': 7.568636235841012},
  '12factor-vault': {'python': 2.6989700043360187},
  '153957-theme': {'add': 3.0},
  '15five-django-ajax-selects': {'add': 3.0, 'python': 2.6989700043360187},
  '15five-snowplow-tracker': {'python': 10.795880017344075},
  '17monip': {'python': 2.6989700043360187},
  '1c-utilites': {'python': 2.6989700043360187},
  '1pass': {'add': 3.0, 'python': 2.6989700043360187},
  '1to001': {'python': 2.6989700043360

Normalizing and ranking the vectorized tokens per project description:

In [50]:
# Normalization: we determine the minimal and maximal values of the tf-idf scores for the corpus vocabulary

corpus_vocabulary_classifiers_min_value, corpus_vocabulary_classifiers_max_value = float("inf"), 0
for package, tokens_with_scores in tf_idf_result_classifiers.items():
    print(tokens_with_scores)
    for token, score in tokens_with_scores.items():
        if score <= corpus_vocabulary_classifiers_min_value:
            corpus_vocabulary_classifiers_min_value = score
        elif score >= corpus_vocabulary_classifiers_max_value:
            corpus_vocabulary_classifiers_max_value = score

print(corpus_vocabulary_classifiers_min_value, corpus_vocabulary_classifiers_max_value)

inf 0
