In [135]:
import json
import pandas as pd
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at localhost:9200 (plain HTTP; no credentials are passed).
es = Elasticsearch(hosts="http://localhost:9200")

# **CREAR ÍNDICE**

In [138]:
# Extra stop words (e.g. "he", "him", "his") filtered out of the reviews in
# addition to Elasticsearch's built-in English list, which is applied by the
# separate "english_stop" filter below.
custom_stop_words = [
    "he", "him", "his", "from", "been", "have", "after", "when", "you",
    "would", "just", "did", "it", "but", "could", "made", "how", "before",
]

# Token filters, referenced by name from the analyzer definition.
token_filters = {
    "lowercase_filter": {"type": "lowercase"},
    "custom_stop": {"type": "stop", "stopwords": custom_stop_words},
    "english_stop": {"type": "stop", "stopwords": "_english_"},
}

# Standard tokenizer, then: lowercase -> custom stop words -> English stop words.
review_analyzer = {
    "tokenizer": "standard",
    "filter": ["lowercase_filter", "custom_stop", "english_stop"],
}

# One entry per CSV column that gets an explicit mapping.
field_properties = {
    "ranking_date": {"type": "date", "format": "yyyy-MM-dd"},
    "pilot_name": {"type": "keyword"},
    "position": {"type": "integer"},
    # Reviews go through the custom analyzer; fielddata is switched on so the
    # field can be used in aggregations.
    "pilot_review": {
        "type": "text",
        "analyzer": "custom_analyzer",
        "fielddata": True,
    },
    "ranking_link": {"type": "text"},
}

# Full request body for index creation: analysis settings plus field mappings.
index_mapping = {
    "settings": {
        "analysis": {
            "analyzer": {"custom_analyzer": review_analyzer},
            "filter": token_filters,
        },
    },
    "mappings": {"properties": field_properties},
}

# Create "my_index" only when it is missing, so re-running this cell is harmless.
# Any other failure (unreachable cluster, rejected analyzer/mapping definition)
# is deliberately left to raise: printing it and carrying on would let the
# indexing cell further down run against an index that lacks this mapping.
if es.indices.exists(index="my_index"):
    print("Index 'my_index' already exists; skipping creation.")
else:
    es.indices.create(index="my_index", body=index_mapping)

In [137]:
# Reset step: drop "my_index" so it can be rebuilt from scratch.
# ignore_unavailable=True keeps the cell from failing when the index does not
# exist (e.g. on a fresh cluster).
# NOTE(review): the execution counts (In [137] here, In [138] on the creation
# cell) show this was run *before* the index was created, yet it sits after the
# creation cell in the file. Run top to bottom, it would drop the index that
# was just created -- move this cell above the creation cell.
es.indices.delete(index='my_index', ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

# **INDEXAR CSV**

In [139]:
# Load the ranking rows from disk. `read_csv` already returns a DataFrame; the
# second step re-wraps it so that both names (`csv` and `df`) stay available.
csv = pd.read_csv(filepath_or_buffer="data/unstructured/rankings_info.csv")
df = pd.DataFrame(data=csv)

In [140]:
# Index every row of `df` as one document; the document id is the 1-based row
# position. Converting the frame to records once avoids building a Series per
# row with `iloc` (slow, and it coerces each row to a single common dtype).
# `res_insert` keeps the response of the last insert for the cell that follows.
for doc_id, doc in enumerate(df.to_dict(orient="records"), start=1):
    res_insert = es.index(index="my_index", id=doc_id, document=doc)

In [141]:
# Pretty-print the response returned for the last document inserted by the loop.
last_insert_json = json.dumps(res_insert.body, indent=3)
print(last_insert_json)

{
   "_index": "my_index",
   "_type": "_doc",
   "_id": "940",
   "_version": 1,
   "result": "created",
   "_shards": {
      "total": 2,
      "successful": 1,
      "failed": 0
   },
   "_seq_no": 939,
   "_primary_term": 1
}


In [142]:
# Spot check: fetch a single document by id. As the last expression of the
# cell, the response is displayed as the cell output.
# NOTE(review): id 455 looks like an arbitrary sample -- confirm.
es.get(index='my_index', id=455)

ObjectApiResponse({'_index': 'my_index', '_type': '_doc', '_id': '455', '_version': 1, '_seq_no': 454, '_primary_term': 1, 'found': True, '_source': {'ranking_date': '2022-11-23', 'pilot_name': 'Esteban Ocon', 'position': 5, 'pilot_review': 'Esteban Ocon qualified a solid eighth behind midfield rival Norris and the Alpine driver pretty much stayed right there for most of the race, his pace sufficient to prevent Vettel and team mate Fernando Alonso from catching up – but insufficient to catch Norris ahead. What was destined to be a P8 finish turned into P7 when Hamilton retired. ', 'ranking_link': 'https://www.formula1.com/en/latest/article/power-rankings-the-scores-come-in-from-the-final-race-of-2022-in-abu-dhabi.6cTDupNjqw80BGLYTEAymr'}})

# **CONSULTAS**

In [143]:
# Range query: documents whose `position` is between 1 and 3 (inclusive).
podium_query = {"range": {"position": {"gte": 1, "lte": 3}}}

# Only the first hit is requested; the full response (including the total
# number of matches) is pretty-printed below.
res = es.search(index="my_index", body={"query": podium_query, "size": 1})

# To show just the matched document instead, use:
#   res.body["hits"]["hits"][0]["_source"]
search_json = json.dumps(res.body, indent=4)
print(search_json)

{
    "took": 260,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 282,
            "relation": "eq"
        },
        "max_score": 1.0,
        "hits": [
            {
                "_index": "my_index",
                "_type": "_doc",
                "_id": "32",
                "_score": 1.0,
                "_source": {
                    "ranking_date": "2024-09-04",
                    "pilot_name": "Oscar Piastri",
                    "position": 2,
                    "pilot_review": "Piastri was understandably frustrated when the chequered flag dropped on Sunday, having fallen 2.6 seconds short of Leclerc after making two stops to the Monegasque\u2019s one. Before that, the Australian had done everything right, pulling off one of the moves of the season on Norris at the start and then controlling the race until McLaren and Ferrari we

In [144]:
# Close the Elasticsearch client at the end of the notebook session.
es.close()