# Natural Language Processing (NLP):

###### **NLTK**: Natural Language Toolkit for various NLP tasks.
###### **spaCy**: Industrial-strength NLP library focusing on efficiency.
###### **gensim**: Topic modeling and document similarity library.
###### **Transformers (Hugging Face)**: Pre-trained NLP models and libraries.

## NLTK

import nltk  
from nltk.corpus import stopwords, wordnet  
from nltk.tokenize import word_tokenize, sent_tokenize  
from nltk.stem import WordNetLemmatizer  
from nltk.probability import FreqDist  
from nltk.tag import pos_tag  
from nltk.chunk import ne_chunk  
from nltk.translate import bleu_score  
from nltk.sentiment import SentimentIntensityAnalyzer  
from nltk.parse import CoreNLPParser  
from nltk.translate.meteor_score import single_meteor_score  

#### 1. Downloading NLTK Resources (One-time)
###### Each identifier below is handed to nltk.download, in the listed order.
###### NOTE(review): resource names can differ between NLTK releases — confirm these against the installed version.
for resource_id in (  
    'punkt',  
    'stopwords',  
    'averaged_perceptron_tagger',  
    'wordnet',  
    'maxent_ne_chunker',  
    'words',  
    'vader_lexicon',  
):  
    nltk.download(resource_id)  

#### 2. Tokenization
###### Split the sample sentence into word-level tokens.
text = "NLTK is a powerful library for natural language processing."  
tokens = word_tokenize(text)  

#### 3. Stopword Removal
###### Tokens are lower-cased before the lookup, so the comparison ignores case.
stop_words = set(stopwords.words('english'))  
filtered_tokens = list(filter(lambda tok: tok.lower() not in stop_words, tokens))  

#### 4. Lemmatization
###### Every remaining token is lemmatized with pos=wordnet.VERB, regardless of its actual part of speech.
lemmatizer = WordNetLemmatizer()  
lemmatized_words = list(  
    map(lambda tok: lemmatizer.lemmatize(tok, pos=wordnet.VERB), filtered_tokens)  
)  

#### 5. Sentence Tokenization
###### Works on the raw text, not on the word tokens.
sentences = sent_tokenize(text)  

#### 6. Part-of-Speech Tagging
###### Tags the full token list (stopwords included).
pos_tags = pos_tag(tokens)  

#### 7. Named Entity Recognition
###### ne_chunk consumes the POS-tagged tokens produced above.
ne_tree = ne_chunk(pos_tags)  

#### 8. Frequency Distribution
###### Counts occurrences of each token in the unfiltered token list.
fdist = FreqDist(tokens)  

#### 9. Sentiment Analysis
###### polarity_scores is called on the raw sentence.
sia = SentimentIntensityAnalyzer()  
sentiment_score = sia.polarity_scores(text)  

#### 10. Machine Translation Evaluation (BLEU and METEOR)
###### sentence_bleu and single_meteor_score compare token sequences, so both the
###### reference and the hypothesis are tokenized before scoring. Passing the raw
###### strings made BLEU iterate over single characters of the hypothesis.
reference = ["NLTK is a powerful library for natural language processing."]  
hypothesis = "NLTK is a great tool for NLP tasks."  
reference_tokens = [word_tokenize(sentence) for sentence in reference]  
hypothesis_tokens = word_tokenize(hypothesis)  
###### Short sentences rarely share 4-grams; smoothing keeps BLEU from collapsing to zero.
bleu_score_value = bleu_score.sentence_bleu(  
    reference_tokens,  
    hypothesis_tokens,  
    smoothing_function=bleu_score.SmoothingFunction().method1,  
)  
###### METEOR scores the hypothesis against one tokenized reference.
meteor_score_value = single_meteor_score(reference_tokens[0], hypothesis_tokens)  

#### Optional: Syntactic Parsing with CoreNLP (needs a CoreNLP server at localhost:9000)
###### The parser is pointed at http://localhost:9000; only the first result yielded for `text` is kept.
parser = CoreNLPParser(url='http://localhost:9000')  
parse_results = parser.raw_parse(text)  
parsed_tree = next(parse_results)  

#### Optional: Chunking and Chinking
###### Two NP rules are applied to the POS-tagged tokens from the tagging step.
grammar = r"""  
  NP: {<DT>?<JJ>*<NN>}   # Chunk sequences of DT, JJ, NN  
  NP: {<DT>?<NNP>+}      # Chunk sequences of DT, NNP  
  """  
chunk_parser = nltk.RegexpParser(grammar)  
chunked_tree = chunk_parser.parse(pos_tags)  


## spaCy

import spacy  
from spacy import displacy  
from spacy.matcher import Matcher  
from spacy.tokens import Doc  
from spacy.lang.en.stop_words import STOP_WORDS  
from collections import Counter  

#### 1. Loading spaCy's English NLP Model
###### Load the "en_core_web_sm" pipeline once; it is reused by every example below.
nlp = spacy.load("en_core_web_sm")  

#### 2. Tokenization and Part-of-Speech Tagging
###### Running the pipeline on the text yields a Doc; each token is printed with its coarse POS tag.
text = "spaCy is a popular library for natural language processing."  
doc = nlp(text)  
for token in doc:  
    print(f"{token.text} {token.pos_}")  

#### 3. Named Entity Recognition
###### One line per entity: its text followed by its label.
for ent in doc.ents:  
    print(f"{ent.text} {ent.label_}")  

#### 4. Dependency Parsing
###### One line per token: text, dependency relation, head text, head POS.
for token in doc:  
    head = token.head  
    print(f"{token.text} {token.dep_} {head.text} {head.pos_}")  

#### 5. Visualization with displaCy
###### Render the dependency view of `doc` with jupyter=True and a custom arc distance.
render_options = {'distance': 90}  
displacy.render(doc, style="dep", jupyter=True, options=render_options)  

#### 6. Stopword Removal
###### Keep the text of tokens whose lower-cased form is absent from STOP_WORDS.
filtered_tokens = [tok.text for tok in doc if not (tok.text.lower() in STOP_WORDS)]  

#### 7. Lemmatization
###### Collect the lemma of every token in the Doc.
lemmatized_words = [tok.lemma_ for tok in doc]  

#### 8. Sentence Segmentation
###### Collect the text of every sentence span in the Doc.
sentences = [span.text for span in doc.sents]  

#### 9. Matching with spaCy's Matcher
###### The pattern matches "popular" followed by "library", compared on lower-cased token text.
pattern = [{"LOWER": "popular"}, {"LOWER": "library"}]  
matcher = Matcher(nlp.vocab)  
matcher.add("PopularLibrary", [pattern])  
matches = matcher(doc)  

#### 10. Frequency Analysis with Counter
###### Count lower-cased token texts.
word_counter = Counter(tok.text.lower() for tok in doc)  

#### Optional: Custom Rule-Based Entity Recognition
def add_custom_entities(doc):  
    """Drop entities whose text starts with '@' from ``doc.ents`` and return ``doc``.

    The entity list is replaced on the passed-in object, so the caller's
    ``doc`` is modified as well as returned.
    """  
    kept_entities = []  
    for entity in doc.ents:  
        if entity.text.startswith('@'):  
            continue  
        kept_entities.append(entity)  
    doc.ents = kept_entities  
    return doc  

###### Register add_custom_entities as the getter behind the `custom_entities` Doc extension.
Doc.set_extension("custom_entities", getter=add_custom_entities)  
doc_with_custom_ents = nlp("I'm talking about @spacy!")  

###### Reading the extension attribute invokes the getter; its return value is printed.
custom_entities_value = doc_with_custom_ents._.custom_entities  
print(custom_entities_value)  


## gensim

import gensim  
from gensim.models import Word2Vec, KeyedVectors  
from gensim.corpora import Dictionary  
from gensim.utils import simple_preprocess  
from gensim.models import TfidfModel  
from gensim.similarities import MatrixSimilarity  
from gensim.summarization import summarize  
from gensim.models import LsiModel  
from gensim.models import CoherenceModel  
from gensim.models.doc2vec import Doc2Vec, TaggedDocument  
from gensim.models.phrases import Phrases, Phraser  
from gensim.models.word2vec import LineSentence  

#### 1. Word Embeddings with Word2Vec
###### Train a small Word2Vec model on two pre-tokenized toy sentences.
sentences = [  
    ["machine", "learning", "is", "fun"],  
    ["natural", "language", "processing", "is", "challenging"],  
]  
model_w2v = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=0)  

#### 2. Loading Pre-trained Word Vectors
###### The path is a placeholder for a word2vec file in binary format.
pretrained_model = KeyedVectors.load_word2vec_format(  
    'path/to/pretrained/model.bin',  
    binary=True,  
)  

#### 3. Document Tokenization
###### simple_preprocess is applied to each raw document string.
documents = [  
    "Gensim is a popular library for text analysis.",  
    "It includes various NLP algorithms.",  
]  
tokenized_docs = list(map(simple_preprocess, documents))  

#### 4. Creating a Dictionary and Bag-of-Words
###### The dictionary is built from the tokenized documents, then each document becomes a bag-of-words.
dictionary = Dictionary(tokenized_docs)  
corpus = list(map(dictionary.doc2bow, tokenized_docs))  

#### 5. TF-IDF Model
###### Fit TF-IDF on the bag-of-words corpus, then apply it to the same corpus.
tfidf = TfidfModel(corpus)  
tfidf_weights = tfidf[corpus]  

#### 6. Similarity Matrix
###### Build the similarity index over the TF-IDF weighted corpus.
similarity_matrix = MatrixSimilarity(tfidf_weights)  

#### 7. Text Summarization
###### NOTE(review): `gensim.summarization` appears to be unavailable in Gensim 4.x, while `vector_size=` used elsewhere in this file is 4.x-style — confirm which Gensim release is targeted.
###### NOTE(review): `text` is a single sentence — check that `summarize` accepts input this short.
text = "Gensim is a Python library for topic modeling and document similarity analysis."  
summary = summarize(text)  

#### 8. Latent Semantic Indexing (LSI)
###### Fit a two-topic LSI model on the TF-IDF weighted corpus and list its topics.
lsi_model = LsiModel(  
    tfidf_weights,  
    id2word=dictionary,  
    num_topics=2,  
)  
lsi_topics = lsi_model.show_topics()  

#### 9. Coherence Model
###### Only constructs the coherence model; no coherence value is computed here.
coherence_model = CoherenceModel(  
    model=lsi_model,  
    texts=tokenized_docs,  
    dictionary=dictionary,  
    coherence='c_v',  
)  

#### 10. Document Embeddings with Doc2Vec
###### Each tokenized document is tagged with its position in the list.
tagged_documents = [  
    TaggedDocument(words, [doc_index])  
    for doc_index, words in enumerate(tokenized_docs)  
]  
model_doc2vec = Doc2Vec(tagged_documents, vector_size=50, window=2, min_count=1, epochs=10)  

#### Optional: Phrase Detection
###### Learn phrases from the tokenized documents, then apply them to each document.
phrases = Phrases(tokenized_docs, min_count=1, threshold=1)  
bigram = Phraser(phrases)  
tokenized_with_phrases = [bigram[words] for words in tokenized_docs]  

#### Optional: Training Word2Vec on Large Corpus
###### The corpus path is a placeholder; LineSentence wraps the file that Word2Vec is trained on.
sentences = LineSentence('path/to/large/corpus.txt')  
model_large_corpus = Word2Vec(sentences, vector_size=100, window=5, min_count=10, sg=0)  


## Transformers (Hugging Face)

import torch  
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig  
from transformers import BertTokenizer, BertForSequenceClassification, BertForMaskedLM, BertForTokenClassification  
from transformers import T5Tokenizer, T5ForConditionalGeneration  
from transformers import GPT2LMHeadModel, GPT2Tokenizer  

#### 1. Text Generation with GPT-2
###### Load the "gpt2" weights and tokenizer, then generate continuations of the prompt.
generator = GPT2LMHeadModel.from_pretrained("gpt2")  
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  
input_text = "Once upon a time"  
input_ids = tokenizer.encode(input_text, return_tensors="pt")  
###### Returning 5 sequences requires a non-greedy strategy: plain greedy decoding
###### rejects num_return_sequences > 1, so sampling is enabled.
###### The EOS id is passed as the pad id because the GPT-2 tokenizer has no pad token.
output = generator.generate(  
    input_ids,  
    max_length=50,  
    num_return_sequences=5,  
    do_sample=True,  
    pad_token_id=tokenizer.eos_token_id,  
)  
###### `generated_text` keeps the first continuation; `generated_texts` holds all of them.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)  
generated_texts = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]  

#### 2. Question Answering with DistilBERT
###### Build a question-answering pipeline, then ask one question against a short context.
question_answering = pipeline(  
    "question-answering",  
    model="distilbert-base-cased-distilled-squad",  
    tokenizer="distilbert-base-cased",  
)  
context = "Hugging Face is a technology company based in New York."  
question = "Where is Hugging Face based?"  
qa_inputs = {"question": question, "context": context}  
result = question_answering(**qa_inputs)  
###### Only the answer entry of the result is kept.
answer = result['answer']  

#### 3. Sentiment Analysis with BERT
###### Model and tokenizer must name the same checkpoint; the tokenizer id previously
###### contained stray spaces ("bert-base-  multilingual…"), which is not a valid model id.
sentiment_classifier = pipeline(  
    "sentiment-analysis",  
    model="nlptown/bert-base-multilingual-uncased-sentiment",  
    tokenizer="nlptown/bert-base-multilingual-uncased-sentiment",  
)  
text = "I love using the Transformers library!"  
###### The pipeline returns one result per input; keep the label of the first.
sentiment = sentiment_classifier(text)[0]['label']  

#### 4. Named Entity Recognition with BERT
###### Build a "ner" pipeline with the same checkpoint for model and tokenizer, then run it on one sentence.
ner_pipeline = pipeline(  
    "ner",  
    model="dbmdz/bert-large-cased-finetuned-conll03-english",  
    tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english",  
)  
text = "Apple Inc. is a technology company."  
entities = ner_pipeline(text)  

#### 5. Token Classification with BERT
###### Load tokenizer and token-classification model from the same checkpoint.
###### BertForTokenClassification must be imported from transformers alongside the other Bert classes.
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")  
model = BertForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")  
text = "Hugging Face is a great NLP library."  
inputs = tokenizer(text, return_tensors="pt")  
outputs = model(**inputs)  
###### argmax over the last axis gives one class id per token position.
predicted_labels = torch.argmax(outputs.logits, dim=2)  
###### Class ids index the model's label map, not the tokenizer vocabulary, so they
###### are translated through model.config.id2label instead of tokenizer.decode.
predicted_entities = [model.config.id2label[label_id.item()] for label_id in predicted_labels[0]]  

#### 6. Masked Language Modeling with BERT
###### Load tokenizer and masked-LM model from "bert-base-uncased" and run the masked sentence through it.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  
model = BertForMaskedLM.from_pretrained("bert-base-uncased")  
text = "I want to [MASK] the world."  
inputs = tokenizer(text, return_tensors="pt")  
outputs = model(**inputs)  
###### Find the first position holding the mask token id, then take the logits at that position.
token_id_list = inputs["input_ids"][0].tolist()  
mask_position = token_id_list.index(tokenizer.mask_token_id)  
masked_token_logits = outputs.logits[0, mask_position]  

#### 7. Sequence Classification with BERT
###### Load tokenizer and sequence-classification model from "bert-base-uncased" and classify one sentence.
###### NOTE(review): this checkpoint name does not indicate a fine-tuned classifier — confirm the prediction is meaningful.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")  
text = "Hugging Face is a great NLP library."  
inputs = tokenizer(text, return_tensors="pt")  
outputs = model(**inputs)  
###### argmax without an axis returns the index of the largest logit overall.
predicted_class = outputs.logits.argmax()  

#### 8. Summarization with T5
###### Summarize one sentence with the "t5-small" checkpoint, bounding the summary length.
summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")  
article = "Hugging Face is a company specializing in Natural Language Processing technologies."  
summary_outputs = summarizer(article, max_length=50, min_length=10)  
summary = summary_outputs[0]['summary_text']  

#### 9. Translation with MarianMT
###### Translate one English sentence with the en-de checkpoint and keep the first result's text.
translator = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de")  
text = "Transformers is a versatile library for natural language processing."  
translation_outputs = translator(text)  
translated_text = translation_outputs[0]['translation_text']  

#### 10. Custom Model and Tokenizer Configuration
###### The model is built from the configuration via from_config (not from_pretrained); the tokenizer uses the same checkpoint name.
checkpoint_name = "bert-base-uncased"  
config = AutoConfig.from_pretrained(checkpoint_name)  
model = AutoModelForSequenceClassification.from_config(config)  
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)  
